Bug Summary

File:out/../deps/icu-small/source/i18n/rematch.cpp
Warning:line 4416, column 36
Dereference of null pointer

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name rematch.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_I18N_IMPLEMENTATION=1 -D U_IO_IMPLEMENTATION=1 -D U_TOOLUTIL_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -I ../deps/icu-small/source/i18n -I ../deps/icu-small/source/tools/toolutil -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem /usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/i18n/rematch.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**************************************************************************
5* Copyright (C) 2002-2016 International Business Machines Corporation
6* and others. All rights reserved.
7**************************************************************************
8*/
9//
10// file: rematch.cpp
11//
12// Contains the implementation of class RegexMatcher,
13// which is one of the main API classes for the ICU regular expression package.
14//
15
16#include "unicode/utypes.h"
17#if !UCONFIG_NO_REGULAR_EXPRESSIONS0
18
19#include "unicode/regex.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/ustring.h"
23#include "unicode/rbbi.h"
24#include "unicode/utf.h"
25#include "unicode/utf16.h"
26#include "uassert.h"
27#include "cmemory.h"
28#include "cstr.h"
29#include "uvector.h"
30#include "uvectr32.h"
31#include "uvectr64.h"
32#include "regeximp.h"
33#include "regexst.h"
34#include "regextxt.h"
35#include "ucase.h"
36
37// #include <malloc.h> // Needed for heapcheck testing
38
39
40U_NAMESPACE_BEGINnamespace icu_71 {
41
42// Default limit for the size of the back track stack, to avoid system
43// failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
44// This value puts ICU's limits higher than most other regexp implementations,
45// which use recursion rather than the heap, and take more storage per
46// backtrack point.
47//
48static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50// Time limit counter constant.
51// Time limits for expression evaluation are in terms of quanta of work by
52// the engine, each of which is 10,000 state saves.
53// This constant determines that state saves per tick number.
54static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57// Test for any of the Unicode line terminating characters.
58static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63}
64
65//-----------------------------------------------------------------------------
66//
67// Constructor and Destructor
68//
69//-----------------------------------------------------------------------------
70RegexMatcher::RegexMatcher(const RegexPattern *pat) {
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
76 if (pat==NULL__null) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
80 fPattern = pat;
81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82}
83
84
85
86RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
88 init(status);
89 if (U_FAILURE(status)) {
90 return;
91 }
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
94 fPattern = fPatternOwned;
95
96 UText inputText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
97 utext_openConstUnicodeStringutext_openConstUnicodeString_71(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_closeutext_close_71(&inputText);
100
101 fInputUniStrMaybeMutable = TRUE1;
102}
103
104
105RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
117 fPattern = fPatternOwned;
118 init2(input, status);
119}
120
121
122RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123 uint32_t flags, UErrorCode &status) {
124 init(status);
125 if (U_FAILURE(status)) {
126 return;
127 }
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135}
136
137RegexMatcher::RegexMatcher(UText *regexp,
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
149 fPattern = fPatternOwned;
150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151}
152
153
154
155
156RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
159 uprv_freeuprv_free_71(fData);
160 fData = NULL__null;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = NULL__null;
165 fPattern = NULL__null;
166 }
167
168 if (fInput) {
169 delete fInput;
170 }
171 if (fInputText) {
172 utext_closeutext_close_71(fInputText);
173 }
174 if (fAltInputText) {
175 utext_closeutext_close_71(fAltInputText);
176 }
177
178 #if UCONFIG_NO_BREAK_ITERATION0==0
179 delete fWordBreakItr;
180 delete fGCBreakItr;
181 #endif
182}
183
184//
185// init() common initialization for use by all constructors.
186// Initialize all fields, get the object into a consistent state.
187// This must be done even when the initial status shows an error,
188// so that the object is initialized sufficiently well for the destructor
189// to run safely.
190//
191void RegexMatcher::init(UErrorCode &status) {
192 fPattern = NULL__null;
193 fPatternOwned = NULL__null;
194 fFrameSize = 0;
195 fRegionStart = 0;
196 fRegionLimit = 0;
197 fAnchorStart = 0;
198 fAnchorLimit = 0;
199 fLookStart = 0;
200 fLookLimit = 0;
201 fActiveStart = 0;
202 fActiveLimit = 0;
203 fTransparentBounds = FALSE0;
204 fAnchoringBounds = TRUE1;
205 fMatch = FALSE0;
206 fMatchStart = 0;
207 fMatchEnd = 0;
208 fLastMatchEnd = -1;
209 fAppendPosition = 0;
210 fHitEnd = FALSE0;
211 fRequireEnd = FALSE0;
212 fStack = NULL__null;
213 fFrame = NULL__null;
214 fTimeLimit = 0;
215 fTime = 0;
216 fTickCounter = 0;
217 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
218 fCallbackFn = NULL__null;
219 fCallbackContext = NULL__null;
220 fFindProgressCallbackFn = NULL__null;
221 fFindProgressCallbackContext = NULL__null;
222 fTraceDebug = FALSE0;
223 fDeferredStatus = status;
224 fData = fSmallData;
225 fWordBreakItr = NULL__null;
226 fGCBreakItr = NULL__null;
227
228 fStack = NULL__null;
229 fInputText = NULL__null;
230 fAltInputText = NULL__null;
231 fInput = NULL__null;
232 fInputLength = 0;
233 fInputUniStrMaybeMutable = FALSE0;
234}
235
236//
237// init2() Common initialization for use by RegexMatcher constructors, part 2.
238// This handles the common setup to be done after the Pattern is available.
239//
240void RegexMatcher::init2(UText *input, UErrorCode &status) {
241 if (U_FAILURE(status)) {
242 fDeferredStatus = status;
243 return;
244 }
245
246 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)(int32_t)(sizeof(fSmallData)/sizeof((fSmallData)[0]))) {
247 fData = (int64_t *)uprv_mallocuprv_malloc_71(fPattern->fDataSize * sizeof(int64_t));
248 if (fData == NULL__null) {
249 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
250 return;
251 }
252 }
253
254 fStack = new UVector64(status);
255 if (fStack == NULL__null) {
256 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
257 return;
258 }
259
260 reset(input);
261 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
262 if (U_FAILURE(status)) {
263 fDeferredStatus = status;
264 return;
265 }
266}
267
268
269static const UChar BACKSLASH = 0x5c;
270static const UChar DOLLARSIGN = 0x24;
271static const UChar LEFTBRACKET = 0x7b;
272static const UChar RIGHTBRACKET = 0x7d;
273
274//--------------------------------------------------------------------------------
275//
276// appendReplacement
277//
278//--------------------------------------------------------------------------------
279RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280 const UnicodeString &replacement,
281 UErrorCode &status) {
282 UText replacementText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
283
284 utext_openConstUnicodeStringutext_openConstUnicodeString_71(&replacementText, &replacement, &status);
285 if (U_SUCCESS(status)) {
286 UText resultText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
287 utext_openUnicodeStringutext_openUnicodeString_71(&resultText, &dest, &status);
288
289 if (U_SUCCESS(status)) {
290 appendReplacement(&resultText, &replacementText, status);
291 utext_closeutext_close_71(&resultText);
292 }
293 utext_closeutext_close_71(&replacementText);
294 }
295
296 return *this;
297}
298
299//
300// appendReplacement, UText mode
301//
302RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303 UText *replacement,
304 UErrorCode &status) {
305 if (U_FAILURE(status)) {
306 return *this;
307 }
308 if (U_FAILURE(fDeferredStatus)) {
309 status = fDeferredStatus;
310 return *this;
311 }
312 if (fMatch == FALSE0) {
313 status = U_REGEX_INVALID_STATE;
314 return *this;
315 }
316
317 // Copy input string from the end of previous match to start of current match
318 int64_t destLen = utext_nativeLengthutext_nativeLength_71(dest);
319 if (fMatchStart > fAppendPosition) {
320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)((0==((fInputText)->chunkNativeStart))&&((fInputLength
)==((fInputText)->chunkNativeLimit))&&((fInputLength
)==((fInputText)->nativeIndexingLimit)))
) {
321 destLen += utext_replaceutext_replace_71(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322 (int32_t)(fMatchStart-fAppendPosition), &status);
323 } else {
324 int32_t len16;
325 if (UTEXT_USES_U16(fInputText)(__null==((fInputText)->pFuncs->mapNativeIndexToUTF16))) {
326 len16 = (int32_t)(fMatchStart-fAppendPosition);
327 } else {
328 UErrorCode lengthStatus = U_ZERO_ERROR;
329 len16 = utext_extractutext_extract_71(fInputText, fAppendPosition, fMatchStart, NULL__null, 0, &lengthStatus);
330 }
331 UChar *inputChars = (UChar *)uprv_mallocuprv_malloc_71(sizeof(UChar)*(len16+1));
332 if (inputChars == NULL__null) {
333 status = U_MEMORY_ALLOCATION_ERROR;
334 return *this;
335 }
336 utext_extractutext_extract_71(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337 destLen += utext_replaceutext_replace_71(dest, destLen, destLen, inputChars, len16, &status);
338 uprv_freeuprv_free_71(inputChars);
339 }
340 }
341 fAppendPosition = fMatchEnd;
342
343
344 // scan the replacement text, looking for substitutions ($n) and \escapes.
345 // TODO: optimize this loop by efficiently scanning for '$' or '\',
346 // move entire ranges not containing substitutions.
347 UTEXT_SETNATIVEINDEX(replacement, 0)do { int64_t __offset = (0) - (replacement)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(replacement
)->nativeIndexingLimit && (replacement)->chunkContents
[__offset]<0xdc00) { (replacement)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((replacement), (0
)); } } while (false)
;
348 for (UChar32 c = UTEXT_NEXT32(replacement)((replacement)->chunkOffset < (replacement)->chunkLength
&& ((replacement)->chunkContents)[(replacement)->
chunkOffset]<0xd800 ? ((replacement)->chunkContents)[((
replacement)->chunkOffset)++] : utext_next32_71(replacement
))
; U_SUCCESS(status) && c != U_SENTINEL(-1); c = UTEXT_NEXT32(replacement)((replacement)->chunkOffset < (replacement)->chunkLength
&& ((replacement)->chunkContents)[(replacement)->
chunkOffset]<0xd800 ? ((replacement)->chunkContents)[((
replacement)->chunkOffset)++] : utext_next32_71(replacement
))
) {
349 if (c == BACKSLASH) {
350 // Backslash Escape. Copy the following char out without further checks.
351 // Note: Surrogate pairs don't need any special handling
352 // The second half wont be a '$' or a '\', and
353 // will move to the dest normally on the next
354 // loop iteration.
355 c = UTEXT_CURRENT32(replacement)((replacement)->chunkOffset < (replacement)->chunkLength
&& ((replacement)->chunkContents)[(replacement)->
chunkOffset]<0xd800 ? ((replacement)->chunkContents)[((
replacement)->chunkOffset)] : utext_current32_71(replacement
))
;
356 if (c == U_SENTINEL(-1)) {
357 break;
358 }
359
360 if (c==0x55/*U*/ || c==0x75/*u*/) {
361 // We have a \udddd or \Udddddddd escape sequence.
362 int32_t offset = 0;
363 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement){ (replacement), -1 };
364 UChar32 escapedChar = u_unescapeAtu_unescapeAt_71(uregex_utext_unescape_charAturegex_utext_unescape_charAt_71, &offset, INT32_MAX(2147483647), &context);
365 if (escapedChar != (UChar32)0xFFFFFFFF) {
366 if (U_IS_BMP(escapedChar)((uint32_t)(escapedChar)<=0xffff)) {
367 UChar c16 = (UChar)escapedChar;
368 destLen += utext_replaceutext_replace_71(dest, destLen, destLen, &c16, 1, &status);
369 } else {
370 UChar surrogate[2];
371 surrogate[0] = U16_LEAD(escapedChar)(UChar)(((escapedChar)>>10)+0xd7c0);
372 surrogate[1] = U16_TRAIL(escapedChar)(UChar)(((escapedChar)&0x3ff)|0xdc00);
373 if (U_SUCCESS(status)) {
374 destLen += utext_replaceutext_replace_71(dest, destLen, destLen, surrogate, 2, &status);
375 }
376 }
377 // TODO: Report errors for mal-formed \u escapes?
378 // As this is, the original sequence is output, which may be OK.
379 if (context.lastOffset == offset) {
380 (void)UTEXT_PREVIOUS32(replacement)((replacement)->chunkOffset > 0 && (replacement
)->chunkContents[(replacement)->chunkOffset-1] < 0xd800
? (replacement)->chunkContents[--((replacement)->chunkOffset
)] : utext_previous32_71(replacement))
;
381 } else if (context.lastOffset != offset-1) {
382 utext_moveIndex32utext_moveIndex32_71(replacement, offset - context.lastOffset - 1);
383 }
384 }
385 } else {
386 (void)UTEXT_NEXT32(replacement)((replacement)->chunkOffset < (replacement)->chunkLength
&& ((replacement)->chunkContents)[(replacement)->
chunkOffset]<0xd800 ? ((replacement)->chunkContents)[((
replacement)->chunkOffset)++] : utext_next32_71(replacement
))
;
387 // Plain backslash escape. Just put out the escaped character.
388 if (U_IS_BMP(c)((uint32_t)(c)<=0xffff)) {
389 UChar c16 = (UChar)c;
390 destLen += utext_replaceutext_replace_71(dest, destLen, destLen, &c16, 1, &status);
391 } else {
392 UChar surrogate[2];
393 surrogate[0] = U16_LEAD(c)(UChar)(((c)>>10)+0xd7c0);
394 surrogate[1] = U16_TRAIL(c)(UChar)(((c)&0x3ff)|0xdc00);
395 if (U_SUCCESS(status)) {
396 destLen += utext_replaceutext_replace_71(dest, destLen, destLen, surrogate, 2, &status);
397 }
398 }
399 }
400 } else if (c != DOLLARSIGN) {
401 // Normal char, not a $. Copy it out without further checks.
402 if (U_IS_BMP(c)((uint32_t)(c)<=0xffff)) {
403 UChar c16 = (UChar)c;
404 destLen += utext_replaceutext_replace_71(dest, destLen, destLen, &c16, 1, &status);
405 } else {
406 UChar surrogate[2];
407 surrogate[0] = U16_LEAD(c)(UChar)(((c)>>10)+0xd7c0);
408 surrogate[1] = U16_TRAIL(c)(UChar)(((c)&0x3ff)|0xdc00);
409 if (U_SUCCESS(status)) {
410 destLen += utext_replaceutext_replace_71(dest, destLen, destLen, surrogate, 2, &status);
411 }
412 }
413 } else {
414 // We've got a $. Pick up a capture group name or number if one follows.
415 // Consume digits so long as the resulting group number <= the number of
416 // number of capture groups in the pattern.
417
418 int32_t groupNum = 0;
419 int32_t numDigits = 0;
420 UChar32 nextChar = utext_current32utext_current32_71(replacement);
421 if (nextChar == LEFTBRACKET) {
422 // Scan for a Named Capture Group, ${name}.
423 UnicodeString groupName;
424 utext_next32utext_next32_71(replacement);
425 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
426 nextChar = utext_next32utext_next32_71(replacement);
427 if (nextChar == U_SENTINEL(-1)) {
428 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
429 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
430 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
431 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
432 groupName.append(nextChar);
433 } else if (nextChar == RIGHTBRACKET) {
434 groupNum = fPattern->fNamedCaptureMap ? uhash_getiuhash_geti_71(fPattern->fNamedCaptureMap, &groupName) : 0;
435 if (groupNum == 0) {
436 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
437 }
438 } else {
439 // Character was something other than a name char or a closing '}'
440 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
441 }
442 }
443
444 } else if (u_isdigitu_isdigit_71(nextChar)) {
445 // $n Scan for a capture group number
446 int32_t numCaptureGroups = fPattern->fGroupMap->size();
447 for (;;) {
448 nextChar = UTEXT_CURRENT32(replacement)((replacement)->chunkOffset < (replacement)->chunkLength
&& ((replacement)->chunkContents)[(replacement)->
chunkOffset]<0xd800 ? ((replacement)->chunkContents)[((
replacement)->chunkOffset)] : utext_current32_71(replacement
))
;
449 if (nextChar == U_SENTINEL(-1)) {
450 break;
451 }
452 if (u_isdigitu_isdigit_71(nextChar) == FALSE0) {
453 break;
454 }
455 int32_t nextDigitVal = u_charDigitValueu_charDigitValue_71(nextChar);
456 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
457 // Don't consume the next digit if it makes the capture group number too big.
458 if (numDigits == 0) {
459 status = U_INDEX_OUTOFBOUNDS_ERROR;
460 }
461 break;
462 }
463 (void)UTEXT_NEXT32(replacement)((replacement)->chunkOffset < (replacement)->chunkLength
&& ((replacement)->chunkContents)[(replacement)->
chunkOffset]<0xd800 ? ((replacement)->chunkContents)[((
replacement)->chunkOffset)++] : utext_next32_71(replacement
))
;
464 groupNum=groupNum*10 + nextDigitVal;
465 ++numDigits;
466 }
467 } else {
468 // $ not followed by capture group name or number.
469 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
470 }
471
472 if (U_SUCCESS(status)) {
473 destLen += appendGroup(groupNum, dest, status);
474 }
475 } // End of $ capture group handling
476 } // End of per-character loop through the replacement string.
477
478 return *this;
479}
480
481
482
483//--------------------------------------------------------------------------------
484//
485// appendTail Intended to be used in conjunction with appendReplacement()
486// To the destination string, append everything following
487// the last match position from the input string.
488//
489// Note: Match ranges do not affect appendTail or appendReplacement
490//
491//--------------------------------------------------------------------------------
492UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
493 UErrorCode status = U_ZERO_ERROR;
494 UText resultText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
495 utext_openUnicodeStringutext_openUnicodeString_71(&resultText, &dest, &status);
496
497 if (U_SUCCESS(status)) {
498 appendTail(&resultText, status);
499 utext_closeutext_close_71(&resultText);
500 }
501
502 return dest;
503}
504
505//
506// appendTail, UText mode
507//
508UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
509 if (U_FAILURE(status)) {
510 return dest;
511 }
512 if (U_FAILURE(fDeferredStatus)) {
513 status = fDeferredStatus;
514 return dest;
515 }
516
517 if (fInputLength > fAppendPosition) {
518 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)((0==((fInputText)->chunkNativeStart))&&((fInputLength
)==((fInputText)->chunkNativeLimit))&&((fInputLength
)==((fInputText)->nativeIndexingLimit)))
) {
519 int64_t destLen = utext_nativeLengthutext_nativeLength_71(dest);
520 utext_replaceutext_replace_71(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
521 (int32_t)(fInputLength-fAppendPosition), &status);
522 } else {
523 int32_t len16;
524 if (UTEXT_USES_U16(fInputText)(__null==((fInputText)->pFuncs->mapNativeIndexToUTF16))) {
525 len16 = (int32_t)(fInputLength-fAppendPosition);
526 } else {
527 len16 = utext_extractutext_extract_71(fInputText, fAppendPosition, fInputLength, NULL__null, 0, &status);
528 status = U_ZERO_ERROR; // buffer overflow
529 }
530
531 UChar *inputChars = (UChar *)uprv_mallocuprv_malloc_71(sizeof(UChar)*(len16));
532 if (inputChars == NULL__null) {
533 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
534 } else {
535 utext_extractutext_extract_71(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
536 int64_t destLen = utext_nativeLengthutext_nativeLength_71(dest);
537 utext_replaceutext_replace_71(dest, destLen, destLen, inputChars, len16, &status);
538 uprv_freeuprv_free_71(inputChars);
539 }
540 }
541 }
542 return dest;
543}
544
545
546
547//--------------------------------------------------------------------------------
548//
549// end
550//
551//--------------------------------------------------------------------------------
552int32_t RegexMatcher::end(UErrorCode &err) const {
553 return end(0, err);
554}
555
556int64_t RegexMatcher::end64(UErrorCode &err) const {
557 return end64(0, err);
558}
559
560int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
561 if (U_FAILURE(err)) {
562 return -1;
563 }
564 if (fMatch == FALSE0) {
565 err = U_REGEX_INVALID_STATE;
566 return -1;
567 }
568 if (group < 0 || group > fPattern->fGroupMap->size()) {
569 err = U_INDEX_OUTOFBOUNDS_ERROR;
570 return -1;
571 }
572 int64_t e = -1;
573 if (group == 0) {
574 e = fMatchEnd;
575 } else {
576 // Get the position within the stack frame of the variables for
577 // this capture group.
578 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
579 U_ASSERT(groupOffset < fPattern->fFrameSize)(void)0;
580 U_ASSERT(groupOffset >= 0)(void)0;
581 e = fFrame->fExtra[groupOffset + 1];
582 }
583
584 return e;
585}
586
587int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
588 return (int32_t)end64(group, err);
589}
590
591//--------------------------------------------------------------------------------
592//
593// findProgressInterrupt This function is called once for each advance in the target
594// string from the find() function, and calls the user progress callback
595// function if there is one installed.
596//
597// Return: TRUE if the find operation is to be terminated.
598// FALSE if the find operation is to continue running.
599//
600//--------------------------------------------------------------------------------
601UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
602 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
603 status = U_REGEX_STOPPED_BY_CALLER;
604 return TRUE1;
605 }
606 return FALSE0;
607}
608
609//--------------------------------------------------------------------------------
610//
611// find()
612//
613//--------------------------------------------------------------------------------
614UBool RegexMatcher::find() {
615 if (U_FAILURE(fDeferredStatus)) {
616 return FALSE0;
617 }
618 UErrorCode status = U_ZERO_ERROR;
619 UBool result = find(status);
620 return result;
621}
622
623//--------------------------------------------------------------------------------
624//
625// find()
626//
627//--------------------------------------------------------------------------------
628UBool RegexMatcher::find(UErrorCode &status) {
629 // Start at the position of the last match end. (Will be zero if the
630 // matcher has been reset.)
631 //
632 if (U_FAILURE(status)) {
633 return FALSE0;
634 }
635 if (U_FAILURE(fDeferredStatus)) {
636 status = fDeferredStatus;
637 return FALSE0;
638 }
639
640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)((0==((fInputText)->chunkNativeStart))&&((fInputLength
)==((fInputText)->chunkNativeLimit))&&((fInputLength
)==((fInputText)->nativeIndexingLimit)))
) {
641 return findUsingChunk(status);
642 }
643
644 int64_t startPos = fMatchEnd;
645 if (startPos==0) {
646 startPos = fActiveStart;
647 }
648
649 if (fMatch) {
650 // Save the position of any previous successful match.
651 fLastMatchEnd = fMatchEnd;
652
653 if (fMatchStart == fMatchEnd) {
654 // Previous match had zero length. Move start position up one position
655 // to avoid sending find() into a loop on zero-length matches.
656 if (startPos >= fActiveLimit) {
657 fMatch = FALSE0;
658 fHitEnd = TRUE1;
659 return FALSE0;
660 }
661 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
662 (void)UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
663 startPos = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
664 }
665 } else {
666 if (fLastMatchEnd >= 0) {
667 // A previous find() failed to match. Don't try again.
668 // (without this test, a pattern with a zero-length match
669 // could match again at the end of an input string.)
670 fHitEnd = TRUE1;
671 return FALSE0;
672 }
673 }
674
675
676 // Compute the position in the input string beyond which a match can not begin, because
677 // the minimum length match would extend past the end of the input.
678 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
679 // Be aware of possible overflows if making changes here.
680 int64_t testStartLimit;
681 if (UTEXT_USES_U16(fInputText)(__null==((fInputText)->pFuncs->mapNativeIndexToUTF16))) {
682 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
683 if (startPos > testStartLimit) {
684 fMatch = FALSE0;
685 fHitEnd = TRUE1;
686 return FALSE0;
687 }
688 } else {
689 // We don't know exactly how long the minimum match length is in native characters.
690 // Treat anything > 0 as 1.
691 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
692 }
693
694 UChar32 c;
695 U_ASSERT(startPos >= 0)(void)0;
696
697 switch (fPattern->fStartType) {
698 case START_NO_INFO:
699 // No optimization was found.
700 // Try a match at each input position.
701 for (;;) {
702 MatchAt(startPos, FALSE0, status);
703 if (U_FAILURE(status)) {
704 return FALSE0;
705 }
706 if (fMatch) {
707 return TRUE1;
708 }
709 if (startPos >= testStartLimit) {
710 fHitEnd = TRUE1;
711 return FALSE0;
712 }
713 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
714 (void)UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
715 startPos = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
716 // Note that it's perfectly OK for a pattern to have a zero-length
717 // match at the end of a string, so we must make sure that the loop
718 // runs with startPos == testStartLimit the last time through.
719 if (findProgressInterrupt(startPos, status))
720 return FALSE0;
721 }
722 UPRV_UNREACHABLE_EXITabort();
723
724 case START_START:
725 // Matches are only possible at the start of the input string
726 // (pattern begins with ^ or \A)
727 if (startPos > fActiveStart) {
728 fMatch = FALSE0;
729 return FALSE0;
730 }
731 MatchAt(startPos, FALSE0, status);
732 if (U_FAILURE(status)) {
733 return FALSE0;
734 }
735 return fMatch;
736
737
738 case START_SET:
739 {
740 // Match may start on any char from a pre-computed set.
741 U_ASSERT(fPattern->fMinMatchLen > 0)(void)0;
742 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
743 for (;;) {
744 int64_t pos = startPos;
745 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
746 startPos = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
747 // c will be -1 (U_SENTINEL) at end of text, in which case we
748 // skip this next block (so we don't have a negative array index)
749 // and handle end of text in the following block.
750 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
751 (c>=256 && fPattern->fInitialChars->contains(c)))) {
752 MatchAt(pos, FALSE0, status);
753 if (U_FAILURE(status)) {
754 return FALSE0;
755 }
756 if (fMatch) {
757 return TRUE1;
758 }
759 UTEXT_SETNATIVEINDEX(fInputText, pos)do { int64_t __offset = (pos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (pos
)); } } while (false)
;
760 }
761 if (startPos > testStartLimit) {
762 fMatch = FALSE0;
763 fHitEnd = TRUE1;
764 return FALSE0;
765 }
766 if (findProgressInterrupt(startPos, status))
767 return FALSE0;
768 }
769 }
770 UPRV_UNREACHABLE_EXITabort();
771
772 case START_STRING:
773 case START_CHAR:
774 {
775 // Match starts on exactly one char.
776 U_ASSERT(fPattern->fMinMatchLen > 0)(void)0;
777 UChar32 theChar = fPattern->fInitialChar;
778 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
779 for (;;) {
780 int64_t pos = startPos;
781 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
782 startPos = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
783 if (c == theChar) {
784 MatchAt(pos, FALSE0, status);
785 if (U_FAILURE(status)) {
786 return FALSE0;
787 }
788 if (fMatch) {
789 return TRUE1;
790 }
791 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
792 }
793 if (startPos > testStartLimit) {
794 fMatch = FALSE0;
795 fHitEnd = TRUE1;
796 return FALSE0;
797 }
798 if (findProgressInterrupt(startPos, status))
799 return FALSE0;
800 }
801 }
802 UPRV_UNREACHABLE_EXITabort();
803
804 case START_LINE:
805 {
806 UChar32 ch;
807 if (startPos == fAnchorStart) {
808 MatchAt(startPos, FALSE0, status);
809 if (U_FAILURE(status)) {
810 return FALSE0;
811 }
812 if (fMatch) {
813 return TRUE1;
814 }
815 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
816 ch = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
817 startPos = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
818 } else {
819 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
820 ch = UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
;
821 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
822 }
823
824 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
825 for (;;) {
826 if (ch == 0x0a) {
827 MatchAt(startPos, FALSE0, status);
828 if (U_FAILURE(status)) {
829 return FALSE0;
830 }
831 if (fMatch) {
832 return TRUE1;
833 }
834 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
835 }
836 if (startPos >= testStartLimit) {
837 fMatch = FALSE0;
838 fHitEnd = TRUE1;
839 return FALSE0;
840 }
841 ch = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
842 startPos = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
843 // Note that it's perfectly OK for a pattern to have a zero-length
844 // match at the end of a string, so we must make sure that the loop
845 // runs with startPos == testStartLimit the last time through.
846 if (findProgressInterrupt(startPos, status))
847 return FALSE0;
848 }
849 } else {
850 for (;;) {
851 if (isLineTerminator(ch)) {
852 if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)] : utext_current32_71(fInputText))
== 0x0a) {
853 (void)UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
854 startPos = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
855 }
856 MatchAt(startPos, FALSE0, status);
857 if (U_FAILURE(status)) {
858 return FALSE0;
859 }
860 if (fMatch) {
861 return TRUE1;
862 }
863 UTEXT_SETNATIVEINDEX(fInputText, startPos)do { int64_t __offset = (startPos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (startPos
)); } } while (false)
;
864 }
865 if (startPos >= testStartLimit) {
866 fMatch = FALSE0;
867 fHitEnd = TRUE1;
868 return FALSE0;
869 }
870 ch = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
871 startPos = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
872 // Note that it's perfectly OK for a pattern to have a zero-length
873 // match at the end of a string, so we must make sure that the loop
874 // runs with startPos == testStartLimit the last time through.
875 if (findProgressInterrupt(startPos, status))
876 return FALSE0;
877 }
878 }
879 }
880
881 default:
882 UPRV_UNREACHABLE_ASSERT(void)0;
883 // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
884 // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
885 // See ICU-21669.
886 status = U_INTERNAL_PROGRAM_ERROR;
887 return FALSE0;
888 }
889
890 UPRV_UNREACHABLE_EXITabort();
891}
892
893
894
895UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
896 if (U_FAILURE(status)) {
897 return FALSE0;
898 }
899 if (U_FAILURE(fDeferredStatus)) {
900 status = fDeferredStatus;
901 return FALSE0;
902 }
903 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
904 // This will reset the region to be the full input length.
905 if (start < 0) {
906 status = U_INDEX_OUTOFBOUNDS_ERROR;
907 return FALSE0;
908 }
909
910 int64_t nativeStart = start;
911 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
912 status = U_INDEX_OUTOFBOUNDS_ERROR;
913 return FALSE0;
914 }
915 fMatchEnd = nativeStart;
916 return find(status);
917}
918
919
920//--------------------------------------------------------------------------------
921//
922// findUsingChunk() -- like find(), but with the advance knowledge that the
923// entire string is available in the UText's chunk buffer.
924//
925//--------------------------------------------------------------------------------
926UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
927 // Start at the position of the last match end. (Will be zero if the
928 // matcher has been reset.
929 //
930
931 int32_t startPos = (int32_t)fMatchEnd;
932 if (startPos==0) {
933 startPos = (int32_t)fActiveStart;
934 }
935
936 const UChar *inputBuf = fInputText->chunkContents;
937
938 if (fMatch) {
939 // Save the position of any previous successful match.
940 fLastMatchEnd = fMatchEnd;
941
942 if (fMatchStart == fMatchEnd) {
943 // Previous match had zero length. Move start position up one position
944 // to avoid sending find() into a loop on zero-length matches.
945 if (startPos >= fActiveLimit) {
946 fMatch = FALSE0;
947 fHitEnd = TRUE1;
948 return FALSE0;
949 }
950 U16_FWD_1(inputBuf, startPos, fInputLength)do { if(((((inputBuf)[(startPos)++])&0xfffffc00)==0xd800)
&& (startPos)!=(fInputLength) && ((((inputBuf
)[startPos])&0xfffffc00)==0xdc00)) { ++(startPos); } } while
(false)
;
951 }
952 } else {
953 if (fLastMatchEnd >= 0) {
954 // A previous find() failed to match. Don't try again.
955 // (without this test, a pattern with a zero-length match
956 // could match again at the end of an input string.)
957 fHitEnd = TRUE1;
958 return FALSE0;
959 }
960 }
961
962
963 // Compute the position in the input string beyond which a match can not begin, because
964 // the minimum length match would extend past the end of the input.
965 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
966 // Be aware of possible overflows if making changes here.
967 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
968 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
969 if (startPos > testLen) {
970 fMatch = FALSE0;
971 fHitEnd = TRUE1;
972 return FALSE0;
973 }
974
975 UChar32 c;
976 U_ASSERT(startPos >= 0)(void)0;
977
978 switch (fPattern->fStartType) {
979 case START_NO_INFO:
980 // No optimization was found.
981 // Try a match at each input position.
982 for (;;) {
983 MatchChunkAt(startPos, FALSE0, status);
984 if (U_FAILURE(status)) {
985 return FALSE0;
986 }
987 if (fMatch) {
988 return TRUE1;
989 }
990 if (startPos >= testLen) {
991 fHitEnd = TRUE1;
992 return FALSE0;
993 }
994 U16_FWD_1(inputBuf, startPos, fActiveLimit)do { if(((((inputBuf)[(startPos)++])&0xfffffc00)==0xd800)
&& (startPos)!=(fActiveLimit) && ((((inputBuf
)[startPos])&0xfffffc00)==0xdc00)) { ++(startPos); } } while
(false)
;
995 // Note that it's perfectly OK for a pattern to have a zero-length
996 // match at the end of a string, so we must make sure that the loop
997 // runs with startPos == testLen the last time through.
998 if (findProgressInterrupt(startPos, status))
999 return FALSE0;
1000 }
1001 UPRV_UNREACHABLE_EXITabort();
1002
1003 case START_START:
1004 // Matches are only possible at the start of the input string
1005 // (pattern begins with ^ or \A)
1006 if (startPos > fActiveStart) {
1007 fMatch = FALSE0;
1008 return FALSE0;
1009 }
1010 MatchChunkAt(startPos, FALSE0, status);
1011 if (U_FAILURE(status)) {
1012 return FALSE0;
1013 }
1014 return fMatch;
1015
1016
1017 case START_SET:
1018 {
1019 // Match may start on any char from a pre-computed set.
1020 U_ASSERT(fPattern->fMinMatchLen > 0)(void)0;
1021 for (;;) {
1022 int32_t pos = startPos;
1023 U16_NEXT(inputBuf, startPos, fActiveLimit, c)do { (c)=(inputBuf)[(startPos)++]; if((((c)&0xfffffc00)==
0xd800)) { uint16_t __c2; if((startPos)!=(fActiveLimit) &&
(((__c2=(inputBuf)[(startPos)])&0xfffffc00)==0xdc00)) { ++
(startPos); (c)=(((UChar32)((c))<<10UL)+(UChar32)(__c2)
-((0xd800<<10UL)+0xdc00-0x10000)); } } } while (false)
; // like c = inputBuf[startPos++];
1024 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1025 (c>=256 && fPattern->fInitialChars->contains(c))) {
1026 MatchChunkAt(pos, FALSE0, status);
1027 if (U_FAILURE(status)) {
1028 return FALSE0;
1029 }
1030 if (fMatch) {
1031 return TRUE1;
1032 }
1033 }
1034 if (startPos > testLen) {
1035 fMatch = FALSE0;
1036 fHitEnd = TRUE1;
1037 return FALSE0;
1038 }
1039 if (findProgressInterrupt(startPos, status))
1040 return FALSE0;
1041 }
1042 }
1043 UPRV_UNREACHABLE_EXITabort();
1044
1045 case START_STRING:
1046 case START_CHAR:
1047 {
1048 // Match starts on exactly one char.
1049 U_ASSERT(fPattern->fMinMatchLen > 0)(void)0;
1050 UChar32 theChar = fPattern->fInitialChar;
1051 for (;;) {
1052 int32_t pos = startPos;
1053 U16_NEXT(inputBuf, startPos, fActiveLimit, c)do { (c)=(inputBuf)[(startPos)++]; if((((c)&0xfffffc00)==
0xd800)) { uint16_t __c2; if((startPos)!=(fActiveLimit) &&
(((__c2=(inputBuf)[(startPos)])&0xfffffc00)==0xdc00)) { ++
(startPos); (c)=(((UChar32)((c))<<10UL)+(UChar32)(__c2)
-((0xd800<<10UL)+0xdc00-0x10000)); } } } while (false)
; // like c = inputBuf[startPos++];
1054 if (c == theChar) {
1055 MatchChunkAt(pos, FALSE0, status);
1056 if (U_FAILURE(status)) {
1057 return FALSE0;
1058 }
1059 if (fMatch) {
1060 return TRUE1;
1061 }
1062 }
1063 if (startPos > testLen) {
1064 fMatch = FALSE0;
1065 fHitEnd = TRUE1;
1066 return FALSE0;
1067 }
1068 if (findProgressInterrupt(startPos, status))
1069 return FALSE0;
1070 }
1071 }
1072 UPRV_UNREACHABLE_EXITabort();
1073
1074 case START_LINE:
1075 {
1076 UChar32 ch;
1077 if (startPos == fAnchorStart) {
1078 MatchChunkAt(startPos, FALSE0, status);
1079 if (U_FAILURE(status)) {
1080 return FALSE0;
1081 }
1082 if (fMatch) {
1083 return TRUE1;
1084 }
1085 U16_FWD_1(inputBuf, startPos, fActiveLimit)do { if(((((inputBuf)[(startPos)++])&0xfffffc00)==0xd800)
&& (startPos)!=(fActiveLimit) && ((((inputBuf
)[startPos])&0xfffffc00)==0xdc00)) { ++(startPos); } } while
(false)
;
1086 }
1087
1088 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1089 for (;;) {
1090 ch = inputBuf[startPos-1];
1091 if (ch == 0x0a) {
1092 MatchChunkAt(startPos, FALSE0, status);
1093 if (U_FAILURE(status)) {
1094 return FALSE0;
1095 }
1096 if (fMatch) {
1097 return TRUE1;
1098 }
1099 }
1100 if (startPos >= testLen) {
1101 fMatch = FALSE0;
1102 fHitEnd = TRUE1;
1103 return FALSE0;
1104 }
1105 U16_FWD_1(inputBuf, startPos, fActiveLimit)do { if(((((inputBuf)[(startPos)++])&0xfffffc00)==0xd800)
&& (startPos)!=(fActiveLimit) && ((((inputBuf
)[startPos])&0xfffffc00)==0xdc00)) { ++(startPos); } } while
(false)
;
1106 // Note that it's perfectly OK for a pattern to have a zero-length
1107 // match at the end of a string, so we must make sure that the loop
1108 // runs with startPos == testLen the last time through.
1109 if (findProgressInterrupt(startPos, status))
1110 return FALSE0;
1111 }
1112 } else {
1113 for (;;) {
1114 ch = inputBuf[startPos-1];
1115 if (isLineTerminator(ch)) {
1116 if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1117 startPos++;
1118 }
1119 MatchChunkAt(startPos, FALSE0, status);
1120 if (U_FAILURE(status)) {
1121 return FALSE0;
1122 }
1123 if (fMatch) {
1124 return TRUE1;
1125 }
1126 }
1127 if (startPos >= testLen) {
1128 fMatch = FALSE0;
1129 fHitEnd = TRUE1;
1130 return FALSE0;
1131 }
1132 U16_FWD_1(inputBuf, startPos, fActiveLimit)do { if(((((inputBuf)[(startPos)++])&0xfffffc00)==0xd800)
&& (startPos)!=(fActiveLimit) && ((((inputBuf
)[startPos])&0xfffffc00)==0xdc00)) { ++(startPos); } } while
(false)
;
1133 // Note that it's perfectly OK for a pattern to have a zero-length
1134 // match at the end of a string, so we must make sure that the loop
1135 // runs with startPos == testLen the last time through.
1136 if (findProgressInterrupt(startPos, status))
1137 return FALSE0;
1138 }
1139 }
1140 }
1141
1142 default:
1143 UPRV_UNREACHABLE_ASSERT(void)0;
1144 // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
1145 // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
1146 // See ICU-21669.
1147 status = U_INTERNAL_PROGRAM_ERROR;
1148 return FALSE0;
1149 }
1150
1151 UPRV_UNREACHABLE_EXITabort();
1152}
1153
1154
1155
1156//--------------------------------------------------------------------------------
1157//
1158// group()
1159//
1160//--------------------------------------------------------------------------------
1161UnicodeString RegexMatcher::group(UErrorCode &status) const {
1162 return group(0, status);
1163}
1164
1165// Return immutable shallow clone
1166UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1167 return group(0, dest, group_len, status);
1168}
1169
1170// Return immutable shallow clone
1171UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1172 group_len = 0;
1173 if (U_FAILURE(status)) {
1174 return dest;
1175 }
1176 if (U_FAILURE(fDeferredStatus)) {
1177 status = fDeferredStatus;
1178 } else if (fMatch == FALSE0) {
1179 status = U_REGEX_INVALID_STATE;
1180 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1181 status = U_INDEX_OUTOFBOUNDS_ERROR;
1182 }
1183
1184 if (U_FAILURE(status)) {
1185 return dest;
1186 }
1187
1188 int64_t s, e;
1189 if (groupNum == 0) {
1190 s = fMatchStart;
1191 e = fMatchEnd;
1192 } else {
1193 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1194 U_ASSERT(groupOffset < fPattern->fFrameSize)(void)0;
1195 U_ASSERT(groupOffset >= 0)(void)0;
1196 s = fFrame->fExtra[groupOffset];
1197 e = fFrame->fExtra[groupOffset+1];
1198 }
1199
1200 if (s < 0) {
1201 // A capture group wasn't part of the match
1202 return utext_cloneutext_clone_71(dest, fInputText, FALSE0, TRUE1, &status);
1203 }
1204 U_ASSERT(s <= e)(void)0;
1205 group_len = e - s;
1206
1207 dest = utext_cloneutext_clone_71(dest, fInputText, FALSE0, TRUE1, &status);
1208 if (dest)
1209 UTEXT_SETNATIVEINDEX(dest, s)do { int64_t __offset = (s) - (dest)->chunkNativeStart; if
(__offset>=0 && __offset<(int64_t)(dest)->nativeIndexingLimit
&& (dest)->chunkContents[__offset]<0xdc00) { (
dest)->chunkOffset=(int32_t)__offset; } else { utext_setNativeIndex_71
((dest), (s)); } } while (false)
;
1210 return dest;
1211}
1212
1213UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1214 UnicodeString result;
1215 int64_t groupStart = start64(groupNum, status);
1216 int64_t groupEnd = end64(groupNum, status);
1217 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1218 return result;
1219 }
1220
1221 // Get the group length using a utext_extract preflight.
1222 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1223 int32_t length = utext_extractutext_extract_71(fInputText, groupStart, groupEnd, NULL__null, 0, &status);
1224 if (status != U_BUFFER_OVERFLOW_ERROR) {
1225 return result;
1226 }
1227
1228 status = U_ZERO_ERROR;
1229 UChar *buf = result.getBuffer(length);
1230 if (buf == NULL__null) {
1231 status = U_MEMORY_ALLOCATION_ERROR;
1232 } else {
1233 int32_t extractLength = utext_extractutext_extract_71(fInputText, groupStart, groupEnd, buf, length, &status);
1234 result.releaseBuffer(extractLength);
1235 U_ASSERT(length == extractLength)(void)0;
1236 }
1237 return result;
1238}
1239
1240
1241//--------------------------------------------------------------------------------
1242//
1243// appendGroup() -- currently internal only, appends a group to a UText rather
1244// than replacing its contents
1245//
1246//--------------------------------------------------------------------------------
1247
1248int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1249 if (U_FAILURE(status)) {
1250 return 0;
1251 }
1252 if (U_FAILURE(fDeferredStatus)) {
1253 status = fDeferredStatus;
1254 return 0;
1255 }
1256 int64_t destLen = utext_nativeLengthutext_nativeLength_71(dest);
1257
1258 if (fMatch == FALSE0) {
1259 status = U_REGEX_INVALID_STATE;
1260 return utext_replaceutext_replace_71(dest, destLen, destLen, NULL__null, 0, &status);
1261 }
1262 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1263 status = U_INDEX_OUTOFBOUNDS_ERROR;
1264 return utext_replaceutext_replace_71(dest, destLen, destLen, NULL__null, 0, &status);
1265 }
1266
1267 int64_t s, e;
1268 if (groupNum == 0) {
1269 s = fMatchStart;
1270 e = fMatchEnd;
1271 } else {
1272 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1273 U_ASSERT(groupOffset < fPattern->fFrameSize)(void)0;
1274 U_ASSERT(groupOffset >= 0)(void)0;
1275 s = fFrame->fExtra[groupOffset];
1276 e = fFrame->fExtra[groupOffset+1];
1277 }
1278
1279 if (s < 0) {
1280 // A capture group wasn't part of the match
1281 return utext_replaceutext_replace_71(dest, destLen, destLen, NULL__null, 0, &status);
1282 }
1283 U_ASSERT(s <= e)(void)0;
1284
1285 int64_t deltaLen;
1286 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)((0==((fInputText)->chunkNativeStart))&&((fInputLength
)==((fInputText)->chunkNativeLimit))&&((fInputLength
)==((fInputText)->nativeIndexingLimit)))
) {
1287 U_ASSERT(e <= fInputLength)(void)0;
1288 deltaLen = utext_replaceutext_replace_71(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1289 } else {
1290 int32_t len16;
1291 if (UTEXT_USES_U16(fInputText)(__null==((fInputText)->pFuncs->mapNativeIndexToUTF16))) {
1292 len16 = (int32_t)(e-s);
1293 } else {
1294 UErrorCode lengthStatus = U_ZERO_ERROR;
1295 len16 = utext_extractutext_extract_71(fInputText, s, e, NULL__null, 0, &lengthStatus);
1296 }
1297 UChar *groupChars = (UChar *)uprv_mallocuprv_malloc_71(sizeof(UChar)*(len16+1));
1298 if (groupChars == NULL__null) {
1299 status = U_MEMORY_ALLOCATION_ERROR;
1300 return 0;
1301 }
1302 utext_extractutext_extract_71(fInputText, s, e, groupChars, len16+1, &status);
1303
1304 deltaLen = utext_replaceutext_replace_71(dest, destLen, destLen, groupChars, len16, &status);
1305 uprv_freeuprv_free_71(groupChars);
1306 }
1307 return deltaLen;
1308}
1309
1310
1311
1312//--------------------------------------------------------------------------------
1313//
1314// groupCount()
1315//
1316//--------------------------------------------------------------------------------
1317int32_t RegexMatcher::groupCount() const {
1318 return fPattern->fGroupMap->size();
1319}
1320
1321//--------------------------------------------------------------------------------
1322//
1323// hasAnchoringBounds()
1324//
1325//--------------------------------------------------------------------------------
1326UBool RegexMatcher::hasAnchoringBounds() const {
1327 return fAnchoringBounds;
1328}
1329
1330
1331//--------------------------------------------------------------------------------
1332//
1333// hasTransparentBounds()
1334//
1335//--------------------------------------------------------------------------------
1336UBool RegexMatcher::hasTransparentBounds() const {
1337 return fTransparentBounds;
1338}
1339
1340
1341
1342//--------------------------------------------------------------------------------
1343//
1344// hitEnd()
1345//
1346//--------------------------------------------------------------------------------
1347UBool RegexMatcher::hitEnd() const {
1348 return fHitEnd;
1349}
1350
1351
1352//--------------------------------------------------------------------------------
1353//
1354// input()
1355//
1356//--------------------------------------------------------------------------------
1357const UnicodeString &RegexMatcher::input() const {
1358 if (!fInput) {
1359 UErrorCode status = U_ZERO_ERROR;
1360 int32_t len16;
1361 if (UTEXT_USES_U16(fInputText)(__null==((fInputText)->pFuncs->mapNativeIndexToUTF16))) {
1362 len16 = (int32_t)fInputLength;
1363 } else {
1364 len16 = utext_extractutext_extract_71(fInputText, 0, fInputLength, NULL__null, 0, &status);
1365 status = U_ZERO_ERROR; // overflow, length status
1366 }
1367 UnicodeString *result = new UnicodeString(len16, 0, 0);
1368
1369 UChar *inputChars = result->getBuffer(len16);
1370 utext_extractutext_extract_71(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1371 result->releaseBuffer(len16);
1372
1373 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1374 }
1375
1376 return *fInput;
1377}
1378
1379//--------------------------------------------------------------------------------
1380//
1381// inputText()
1382//
1383//--------------------------------------------------------------------------------
1384UText *RegexMatcher::inputText() const {
1385 return fInputText;
1386}
1387
1388
1389//--------------------------------------------------------------------------------
1390//
1391// getInput() -- like inputText(), but makes a clone or copies into another UText
1392//
1393//--------------------------------------------------------------------------------
1394UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1395 if (U_FAILURE(status)) {
1396 return dest;
1397 }
1398 if (U_FAILURE(fDeferredStatus)) {
1399 status = fDeferredStatus;
1400 return dest;
1401 }
1402
1403 if (dest) {
1404 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)((0==((fInputText)->chunkNativeStart))&&((fInputLength
)==((fInputText)->chunkNativeLimit))&&((fInputLength
)==((fInputText)->nativeIndexingLimit)))
) {
1405 utext_replaceutext_replace_71(dest, 0, utext_nativeLengthutext_nativeLength_71(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1406 } else {
1407 int32_t input16Len;
1408 if (UTEXT_USES_U16(fInputText)(__null==((fInputText)->pFuncs->mapNativeIndexToUTF16))) {
1409 input16Len = (int32_t)fInputLength;
1410 } else {
1411 UErrorCode lengthStatus = U_ZERO_ERROR;
1412 input16Len = utext_extractutext_extract_71(fInputText, 0, fInputLength, NULL__null, 0, &lengthStatus); // buffer overflow error
1413 }
1414 UChar *inputChars = (UChar *)uprv_mallocuprv_malloc_71(sizeof(UChar)*(input16Len));
1415 if (inputChars == NULL__null) {
1416 return dest;
1417 }
1418
1419 status = U_ZERO_ERROR;
1420 utext_extractutext_extract_71(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1421 status = U_ZERO_ERROR;
1422 utext_replaceutext_replace_71(dest, 0, utext_nativeLengthutext_nativeLength_71(dest), inputChars, input16Len, &status);
1423
1424 uprv_freeuprv_free_71(inputChars);
1425 }
1426 return dest;
1427 } else {
1428 return utext_cloneutext_clone_71(NULL__null, fInputText, FALSE0, TRUE1, &status);
1429 }
1430}
1431
1432
1433static UBool compat_SyncMutableUTextContents(UText *ut);
1434static UBool compat_SyncMutableUTextContents(UText *ut) {
1435 UBool retVal = FALSE0;
1436
1437 // In the following test, we're really only interested in whether the UText should switch
1438 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1439 // will still point to the correct data.
1440 if (utext_nativeLengthutext_nativeLength_71(ut) != ut->nativeIndexingLimit) {
1441 UnicodeString *us=(UnicodeString *)ut->context;
1442
1443 // Update to the latest length.
1444 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1445 int32_t newLength = us->length();
1446
1447 // Update the chunk description.
1448 // The buffer may have switched between stack- and heap-based.
1449 ut->chunkContents = us->getBuffer();
1450 ut->chunkLength = newLength;
1451 ut->chunkNativeLimit = newLength;
1452 ut->nativeIndexingLimit = newLength;
1453 retVal = TRUE1;
1454 }
1455
1456 return retVal;
1457}
1458
1459//--------------------------------------------------------------------------------
1460//
1461// lookingAt()
1462//
1463//--------------------------------------------------------------------------------
1464UBool RegexMatcher::lookingAt(UErrorCode &status) {
1465 if (U_FAILURE(status)) {
1466 return FALSE0;
1467 }
1468 if (U_FAILURE(fDeferredStatus)) {
1469 status = fDeferredStatus;
1470 return FALSE0;
1471 }
1472
1473 if (fInputUniStrMaybeMutable) {
1474 if (compat_SyncMutableUTextContents(fInputText)) {
1475 fInputLength = utext_nativeLengthutext_nativeLength_71(fInputText);
1476 reset();
1477 }
1478 }
1479 else {
1480 resetPreserveRegion();
1481 }
1482 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)((0==((fInputText)->chunkNativeStart))&&((fInputLength
)==((fInputText)->chunkNativeLimit))&&((fInputLength
)==((fInputText)->nativeIndexingLimit)))
) {
1483 MatchChunkAt((int32_t)fActiveStart, FALSE0, status);
1484 } else {
1485 MatchAt(fActiveStart, FALSE0, status);
1486 }
1487 return fMatch;
1488}
1489
1490
1491UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1492 if (U_FAILURE(status)) {
1493 return FALSE0;
1494 }
1495 if (U_FAILURE(fDeferredStatus)) {
1496 status = fDeferredStatus;
1497 return FALSE0;
1498 }
1499 reset();
1500
1501 if (start < 0) {
1502 status = U_INDEX_OUTOFBOUNDS_ERROR;
1503 return FALSE0;
1504 }
1505
1506 if (fInputUniStrMaybeMutable) {
1507 if (compat_SyncMutableUTextContents(fInputText)) {
1508 fInputLength = utext_nativeLengthutext_nativeLength_71(fInputText);
1509 reset();
1510 }
1511 }
1512
1513 int64_t nativeStart;
1514 nativeStart = start;
1515 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1516 status = U_INDEX_OUTOFBOUNDS_ERROR;
1517 return FALSE0;
1518 }
1519
1520 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)((0==((fInputText)->chunkNativeStart))&&((fInputLength
)==((fInputText)->chunkNativeLimit))&&((fInputLength
)==((fInputText)->nativeIndexingLimit)))
) {
1521 MatchChunkAt((int32_t)nativeStart, FALSE0, status);
1522 } else {
1523 MatchAt(nativeStart, FALSE0, status);
1524 }
1525 return fMatch;
1526}
1527
1528
1529
1530//--------------------------------------------------------------------------------
1531//
1532// matches()
1533//
1534//--------------------------------------------------------------------------------
1535UBool RegexMatcher::matches(UErrorCode &status) {
1536 if (U_FAILURE(status)) {
1537 return FALSE0;
1538 }
1539 if (U_FAILURE(fDeferredStatus)) {
1540 status = fDeferredStatus;
1541 return FALSE0;
1542 }
1543
1544 if (fInputUniStrMaybeMutable) {
1545 if (compat_SyncMutableUTextContents(fInputText)) {
1546 fInputLength = utext_nativeLengthutext_nativeLength_71(fInputText);
1547 reset();
1548 }
1549 }
1550 else {
1551 resetPreserveRegion();
1552 }
1553
1554 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)((0==((fInputText)->chunkNativeStart))&&((fInputLength
)==((fInputText)->chunkNativeLimit))&&((fInputLength
)==((fInputText)->nativeIndexingLimit)))
) {
1555 MatchChunkAt((int32_t)fActiveStart, TRUE1, status);
1556 } else {
1557 MatchAt(fActiveStart, TRUE1, status);
1558 }
1559 return fMatch;
1560}
1561
1562
1563UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1564 if (U_FAILURE(status)) {
1565 return FALSE0;
1566 }
1567 if (U_FAILURE(fDeferredStatus)) {
1568 status = fDeferredStatus;
1569 return FALSE0;
1570 }
1571 reset();
1572
1573 if (start < 0) {
1574 status = U_INDEX_OUTOFBOUNDS_ERROR;
1575 return FALSE0;
1576 }
1577
1578 if (fInputUniStrMaybeMutable) {
1579 if (compat_SyncMutableUTextContents(fInputText)) {
1580 fInputLength = utext_nativeLengthutext_nativeLength_71(fInputText);
1581 reset();
1582 }
1583 }
1584
1585 int64_t nativeStart;
1586 nativeStart = start;
1587 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1588 status = U_INDEX_OUTOFBOUNDS_ERROR;
1589 return FALSE0;
1590 }
1591
1592 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)((0==((fInputText)->chunkNativeStart))&&((fInputLength
)==((fInputText)->chunkNativeLimit))&&((fInputLength
)==((fInputText)->nativeIndexingLimit)))
) {
1593 MatchChunkAt((int32_t)nativeStart, TRUE1, status);
1594 } else {
1595 MatchAt(nativeStart, TRUE1, status);
1596 }
1597 return fMatch;
1598}
1599
1600
1601
1602//--------------------------------------------------------------------------------
1603//
1604// pattern
1605//
1606//--------------------------------------------------------------------------------
1607const RegexPattern &RegexMatcher::pattern() const {
1608 return *fPattern;
1609}
1610
1611
1612
1613//--------------------------------------------------------------------------------
1614//
1615// region
1616//
1617//--------------------------------------------------------------------------------
1618RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1619 if (U_FAILURE(status)) {
1620 return *this;
1621 }
1622
1623 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1624 status = U_ILLEGAL_ARGUMENT_ERROR;
1625 }
1626
1627 int64_t nativeStart = regionStart;
1628 int64_t nativeLimit = regionLimit;
1629 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1630 status = U_ILLEGAL_ARGUMENT_ERROR;
1631 }
1632
1633 if (startIndex == -1)
1634 this->reset();
1635 else
1636 resetPreserveRegion();
1637
1638 fRegionStart = nativeStart;
1639 fRegionLimit = nativeLimit;
1640 fActiveStart = nativeStart;
1641 fActiveLimit = nativeLimit;
1642
1643 if (startIndex != -1) {
1644 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1645 status = U_INDEX_OUTOFBOUNDS_ERROR;
1646 }
1647 fMatchEnd = startIndex;
1648 }
1649
1650 if (!fTransparentBounds) {
1651 fLookStart = nativeStart;
1652 fLookLimit = nativeLimit;
1653 }
1654 if (fAnchoringBounds) {
1655 fAnchorStart = nativeStart;
1656 fAnchorLimit = nativeLimit;
1657 }
1658 return *this;
1659}
1660
1661RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1662 return region(start, limit, -1, status);
1663}
1664
1665//--------------------------------------------------------------------------------
1666//
1667// regionEnd
1668//
1669//--------------------------------------------------------------------------------
1670int32_t RegexMatcher::regionEnd() const {
1671 return (int32_t)fRegionLimit;
1672}
1673
1674int64_t RegexMatcher::regionEnd64() const {
1675 return fRegionLimit;
1676}
1677
1678//--------------------------------------------------------------------------------
1679//
1680// regionStart
1681//
1682//--------------------------------------------------------------------------------
1683int32_t RegexMatcher::regionStart() const {
1684 return (int32_t)fRegionStart;
1685}
1686
1687int64_t RegexMatcher::regionStart64() const {
1688 return fRegionStart;
1689}
1690
1691
1692//--------------------------------------------------------------------------------
1693//
1694// replaceAll
1695//
1696//--------------------------------------------------------------------------------
1697UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1698 UText replacementText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
1699 UText resultText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
1700 UnicodeString resultString;
1701 if (U_FAILURE(status)) {
1702 return resultString;
1703 }
1704
1705 utext_openConstUnicodeStringutext_openConstUnicodeString_71(&replacementText, &replacement, &status);
1706 utext_openUnicodeStringutext_openUnicodeString_71(&resultText, &resultString, &status);
1707
1708 replaceAll(&replacementText, &resultText, status);
1709
1710 utext_closeutext_close_71(&resultText);
1711 utext_closeutext_close_71(&replacementText);
1712
1713 return resultString;
1714}
1715
1716
1717//
1718// replaceAll, UText mode
1719//
1720UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1721 if (U_FAILURE(status)) {
1722 return dest;
1723 }
1724 if (U_FAILURE(fDeferredStatus)) {
1725 status = fDeferredStatus;
1726 return dest;
1727 }
1728
1729 if (dest == NULL__null) {
1730 UnicodeString emptyString;
1731 UText empty = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
1732
1733 utext_openUnicodeStringutext_openUnicodeString_71(&empty, &emptyString, &status);
1734 dest = utext_cloneutext_clone_71(NULL__null, &empty, TRUE1, FALSE0, &status);
1735 utext_closeutext_close_71(&empty);
1736 }
1737
1738 if (U_SUCCESS(status)) {
1739 reset();
1740 while (find()) {
1741 appendReplacement(dest, replacement, status);
1742 if (U_FAILURE(status)) {
1743 break;
1744 }
1745 }
1746 appendTail(dest, status);
1747 }
1748
1749 return dest;
1750}
1751
1752
1753//--------------------------------------------------------------------------------
1754//
1755// replaceFirst
1756//
1757//--------------------------------------------------------------------------------
1758UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1759 UText replacementText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
1760 UText resultText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
1761 UnicodeString resultString;
1762
1763 utext_openConstUnicodeStringutext_openConstUnicodeString_71(&replacementText, &replacement, &status);
1764 utext_openUnicodeStringutext_openUnicodeString_71(&resultText, &resultString, &status);
1765
1766 replaceFirst(&replacementText, &resultText, status);
1767
1768 utext_closeutext_close_71(&resultText);
1769 utext_closeutext_close_71(&replacementText);
1770
1771 return resultString;
1772}
1773
1774//
1775// replaceFirst, UText mode
1776//
1777UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1778 if (U_FAILURE(status)) {
1779 return dest;
1780 }
1781 if (U_FAILURE(fDeferredStatus)) {
1782 status = fDeferredStatus;
1783 return dest;
1784 }
1785
1786 reset();
1787 if (!find()) {
1788 return getInput(dest, status);
1789 }
1790
1791 if (dest == NULL__null) {
1792 UnicodeString emptyString;
1793 UText empty = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
1794
1795 utext_openUnicodeStringutext_openUnicodeString_71(&empty, &emptyString, &status);
1796 dest = utext_cloneutext_clone_71(NULL__null, &empty, TRUE1, FALSE0, &status);
1797 utext_closeutext_close_71(&empty);
1798 }
1799
1800 appendReplacement(dest, replacement, status);
1801 appendTail(dest, status);
1802
1803 return dest;
1804}
1805
1806
1807//--------------------------------------------------------------------------------
1808//
1809// requireEnd
1810//
1811//--------------------------------------------------------------------------------
1812UBool RegexMatcher::requireEnd() const {
1813 return fRequireEnd;
1814}
1815
1816
1817//--------------------------------------------------------------------------------
1818//
1819// reset
1820//
1821//--------------------------------------------------------------------------------
1822RegexMatcher &RegexMatcher::reset() {
1823 fRegionStart = 0;
1824 fRegionLimit = fInputLength;
1825 fActiveStart = 0;
1826 fActiveLimit = fInputLength;
1827 fAnchorStart = 0;
1828 fAnchorLimit = fInputLength;
1829 fLookStart = 0;
1830 fLookLimit = fInputLength;
1831 resetPreserveRegion();
1832 return *this;
1833}
1834
1835
1836
1837void RegexMatcher::resetPreserveRegion() {
1838 fMatchStart = 0;
1839 fMatchEnd = 0;
1840 fLastMatchEnd = -1;
1841 fAppendPosition = 0;
1842 fMatch = FALSE0;
1843 fHitEnd = FALSE0;
1844 fRequireEnd = FALSE0;
1845 fTime = 0;
1846 fTickCounter = TIMER_INITIAL_VALUE;
1847 //resetStack(); // more expensive than it looks...
1848}
1849
1850
1851RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1852 fInputText = utext_openConstUnicodeStringutext_openConstUnicodeString_71(fInputText, &input, &fDeferredStatus);
1853 if (fPattern->fNeedsAltInput) {
1854 fAltInputText = utext_cloneutext_clone_71(fAltInputText, fInputText, FALSE0, TRUE1, &fDeferredStatus);
1855 }
1856 if (U_FAILURE(fDeferredStatus)) {
1857 return *this;
1858 }
1859 fInputLength = utext_nativeLengthutext_nativeLength_71(fInputText);
1860
1861 reset();
1862 delete fInput;
1863 fInput = NULL__null;
1864
1865 // Do the following for any UnicodeString.
1866 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1867 fInputUniStrMaybeMutable = TRUE1;
1868
1869#if UCONFIG_NO_BREAK_ITERATION0==0
1870 if (fWordBreakItr) {
1871 fWordBreakItr->setText(fInputText, fDeferredStatus);
1872 }
1873 if (fGCBreakItr) {
1874 fGCBreakItr->setText(fInputText, fDeferredStatus);
1875 }
1876#endif
1877
1878 return *this;
1879}
1880
1881
1882RegexMatcher &RegexMatcher::reset(UText *input) {
1883 if (fInputText != input) {
1884 fInputText = utext_cloneutext_clone_71(fInputText, input, FALSE0, TRUE1, &fDeferredStatus);
1885 if (fPattern->fNeedsAltInput) fAltInputText = utext_cloneutext_clone_71(fAltInputText, fInputText, FALSE0, TRUE1, &fDeferredStatus);
1886 if (U_FAILURE(fDeferredStatus)) {
1887 return *this;
1888 }
1889 fInputLength = utext_nativeLengthutext_nativeLength_71(fInputText);
1890
1891 delete fInput;
1892 fInput = NULL__null;
1893
1894#if UCONFIG_NO_BREAK_ITERATION0==0
1895 if (fWordBreakItr) {
1896 fWordBreakItr->setText(input, fDeferredStatus);
1897 }
1898 if (fGCBreakItr) {
1899 fGCBreakItr->setText(fInputText, fDeferredStatus);
1900 }
1901#endif
1902 }
1903 reset();
1904 fInputUniStrMaybeMutable = FALSE0;
1905
1906 return *this;
1907}
1908
1909/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1910 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1911 return *this;
1912}*/
1913
1914RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1915 if (U_FAILURE(status)) {
1916 return *this;
1917 }
1918 reset(); // Reset also resets the region to be the entire string.
1919
1920 if (position < 0 || position > fActiveLimit) {
1921 status = U_INDEX_OUTOFBOUNDS_ERROR;
1922 return *this;
1923 }
1924 fMatchEnd = position;
1925 return *this;
1926}
1927
1928
1929//--------------------------------------------------------------------------------
1930//
1931// refresh
1932//
1933//--------------------------------------------------------------------------------
1934RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1935 if (U_FAILURE(status)) {
1936 return *this;
1937 }
1938 if (input == NULL__null) {
1939 status = U_ILLEGAL_ARGUMENT_ERROR;
1940 return *this;
1941 }
1942 if (utext_nativeLengthutext_nativeLength_71(fInputText) != utext_nativeLengthutext_nativeLength_71(input)) {
1943 status = U_ILLEGAL_ARGUMENT_ERROR;
1944 return *this;
1945 }
1946 int64_t pos = utext_getNativeIndexutext_getNativeIndex_71(fInputText);
1947 // Shallow read-only clone of the new UText into the existing input UText
1948 fInputText = utext_cloneutext_clone_71(fInputText, input, FALSE0, TRUE1, &status);
1949 if (U_FAILURE(status)) {
1950 return *this;
1951 }
1952 utext_setNativeIndexutext_setNativeIndex_71(fInputText, pos);
1953
1954 if (fAltInputText != NULL__null) {
1955 pos = utext_getNativeIndexutext_getNativeIndex_71(fAltInputText);
1956 fAltInputText = utext_cloneutext_clone_71(fAltInputText, input, FALSE0, TRUE1, &status);
1957 if (U_FAILURE(status)) {
1958 return *this;
1959 }
1960 utext_setNativeIndexutext_setNativeIndex_71(fAltInputText, pos);
1961 }
1962 return *this;
1963}
1964
1965
1966
1967//--------------------------------------------------------------------------------
1968//
1969// setTrace
1970//
1971//--------------------------------------------------------------------------------
1972void RegexMatcher::setTrace(UBool state) {
1973 fTraceDebug = state;
1974}
1975
1976
1977
1978/**
1979 * UText, replace entire contents of the destination UText with a substring of the source UText.
1980 *
1981 * @param src The source UText
1982 * @param dest The destination UText. Must be writable.
1983 * May be NULL, in which case a new UText will be allocated.
1984 * @param start Start index of source substring.
1985 * @param limit Limit index of source substring.
1986 * @param status An error code.
1987 */
1988static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1989 if (U_FAILURE(*status)) {
1990 return dest;
1991 }
1992 if (start == limit) {
1993 if (dest) {
1994 utext_replaceutext_replace_71(dest, 0, utext_nativeLengthutext_nativeLength_71(dest), NULL__null, 0, status);
1995 return dest;
1996 } else {
1997 return utext_openUCharsutext_openUChars_71(NULL__null, NULL__null, 0, status);
1998 }
1999 }
2000 int32_t length = utext_extractutext_extract_71(src, start, limit, NULL__null, 0, status);
2001 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
2002 return dest;
2003 }
2004 *status = U_ZERO_ERROR;
2005 MaybeStackArray<UChar, 40> buffer;
2006 if (length >= buffer.getCapacity()) {
2007 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
2008 if (newBuf == NULL__null) {
2009 *status = U_MEMORY_ALLOCATION_ERROR;
2010 }
2011 }
2012 utext_extractutext_extract_71(src, start, limit, buffer.getAlias(), length+1, status);
2013 if (dest) {
2014 utext_replaceutext_replace_71(dest, 0, utext_nativeLengthutext_nativeLength_71(dest), buffer.getAlias(), length, status);
2015 return dest;
2016 }
2017
2018 // Caller did not provide a preexisting UText.
2019 // Open a new one, and have it adopt the text buffer storage.
2020 if (U_FAILURE(*status)) {
2021 return NULL__null;
2022 }
2023 int32_t ownedLength = 0;
2024 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2025 if (ownedBuf == NULL__null) {
2026 *status = U_MEMORY_ALLOCATION_ERROR;
2027 return NULL__null;
2028 }
2029 UText *result = utext_openUCharsutext_openUChars_71(NULL__null, ownedBuf, length, status);
2030 if (U_FAILURE(*status)) {
2031 uprv_freeuprv_free_71(ownedBuf);
2032 return NULL__null;
2033 }
2034 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2035 return result;
2036}
2037
2038
2039//---------------------------------------------------------------------
2040//
2041// split
2042//
2043//---------------------------------------------------------------------
2044int32_t RegexMatcher::split(const UnicodeString &input,
2045 UnicodeString dest[],
2046 int32_t destCapacity,
2047 UErrorCode &status)
2048{
2049 UText inputText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
2050 utext_openConstUnicodeStringutext_openConstUnicodeString_71(&inputText, &input, &status);
2051 if (U_FAILURE(status)) {
2052 return 0;
2053 }
2054
2055 UText **destText = (UText **)uprv_mallocuprv_malloc_71(sizeof(UText*)*destCapacity);
2056 if (destText == NULL__null) {
2057 status = U_MEMORY_ALLOCATION_ERROR;
2058 return 0;
2059 }
2060 int32_t i;
2061 for (i = 0; i < destCapacity; i++) {
2062 destText[i] = utext_openUnicodeStringutext_openUnicodeString_71(NULL__null, &dest[i], &status);
2063 }
2064
2065 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2066
2067 for (i = 0; i < destCapacity; i++) {
2068 utext_closeutext_close_71(destText[i]);
2069 }
2070
2071 uprv_freeuprv_free_71(destText);
2072 utext_closeutext_close_71(&inputText);
2073 return fieldCount;
2074}
2075
2076//
2077// split, UText mode
2078//
2079int32_t RegexMatcher::split(UText *input,
2080 UText *dest[],
2081 int32_t destCapacity,
2082 UErrorCode &status)
2083{
2084 //
2085 // Check arguments for validity
2086 //
2087 if (U_FAILURE(status)) {
2088 return 0;
2089 }
2090
2091 if (destCapacity < 1) {
2092 status = U_ILLEGAL_ARGUMENT_ERROR;
2093 return 0;
2094 }
2095
2096 //
2097 // Reset for the input text
2098 //
2099 reset(input);
2100 int64_t nextOutputStringStart = 0;
2101 if (fActiveLimit == 0) {
2102 return 0;
2103 }
2104
2105 //
2106 // Loop through the input text, searching for the delimiter pattern
2107 //
2108 int32_t i;
2109 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2110 for (i=0; ; i++) {
2111 if (i>=destCapacity-1) {
2112 // There is one or zero output string left.
2113 // Fill the last output string with whatever is left from the input, then exit the loop.
2114 // ( i will be == destCapacity if we filled the output array while processing
2115 // capture groups of the delimiter expression, in which case we will discard the
2116 // last capture group saved in favor of the unprocessed remainder of the
2117 // input string.)
2118 i = destCapacity-1;
2119 if (fActiveLimit > nextOutputStringStart) {
2120 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)((0==((input)->chunkNativeStart))&&((fInputLength)
==((input)->chunkNativeLimit))&&((fInputLength)==(
(input)->nativeIndexingLimit)))
) {
2121 if (dest[i]) {
2122 utext_replaceutext_replace_71(dest[i], 0, utext_nativeLengthutext_nativeLength_71(dest[i]),
2123 input->chunkContents+nextOutputStringStart,
2124 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2125 } else {
2126 UText remainingText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
2127 utext_openUCharsutext_openUChars_71(&remainingText, input->chunkContents+nextOutputStringStart,
2128 fActiveLimit-nextOutputStringStart, &status);
2129 dest[i] = utext_cloneutext_clone_71(NULL__null, &remainingText, TRUE1, FALSE0, &status);
2130 utext_closeutext_close_71(&remainingText);
2131 }
2132 } else {
2133 UErrorCode lengthStatus = U_ZERO_ERROR;
2134 int32_t remaining16Length =
2135 utext_extractutext_extract_71(input, nextOutputStringStart, fActiveLimit, NULL__null, 0, &lengthStatus);
2136 UChar *remainingChars = (UChar *)uprv_mallocuprv_malloc_71(sizeof(UChar)*(remaining16Length+1));
2137 if (remainingChars == NULL__null) {
2138 status = U_MEMORY_ALLOCATION_ERROR;
2139 break;
2140 }
2141
2142 utext_extractutext_extract_71(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2143 if (dest[i]) {
2144 utext_replaceutext_replace_71(dest[i], 0, utext_nativeLengthutext_nativeLength_71(dest[i]), remainingChars, remaining16Length, &status);
2145 } else {
2146 UText remainingText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
2147 utext_openUCharsutext_openUChars_71(&remainingText, remainingChars, remaining16Length, &status);
2148 dest[i] = utext_cloneutext_clone_71(NULL__null, &remainingText, TRUE1, FALSE0, &status);
2149 utext_closeutext_close_71(&remainingText);
2150 }
2151
2152 uprv_freeuprv_free_71(remainingChars);
2153 }
2154 }
2155 break;
2156 }
2157 if (find()) {
2158 // We found another delimiter. Move everything from where we started looking
2159 // up until the start of the delimiter into the next output string.
2160 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)((0==((input)->chunkNativeStart))&&((fInputLength)
==((input)->chunkNativeLimit))&&((fInputLength)==(
(input)->nativeIndexingLimit)))
) {
2161 if (dest[i]) {
2162 utext_replaceutext_replace_71(dest[i], 0, utext_nativeLengthutext_nativeLength_71(dest[i]),
2163 input->chunkContents+nextOutputStringStart,
2164 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2165 } else {
2166 UText remainingText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
2167 utext_openUCharsutext_openUChars_71(&remainingText, input->chunkContents+nextOutputStringStart,
2168 fMatchStart-nextOutputStringStart, &status);
2169 dest[i] = utext_cloneutext_clone_71(NULL__null, &remainingText, TRUE1, FALSE0, &status);
2170 utext_closeutext_close_71(&remainingText);
2171 }
2172 } else {
2173 UErrorCode lengthStatus = U_ZERO_ERROR;
2174 int32_t remaining16Length = utext_extractutext_extract_71(input, nextOutputStringStart, fMatchStart, NULL__null, 0, &lengthStatus);
2175 UChar *remainingChars = (UChar *)uprv_mallocuprv_malloc_71(sizeof(UChar)*(remaining16Length+1));
2176 if (remainingChars == NULL__null) {
2177 status = U_MEMORY_ALLOCATION_ERROR;
2178 break;
2179 }
2180 utext_extractutext_extract_71(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2181 if (dest[i]) {
2182 utext_replaceutext_replace_71(dest[i], 0, utext_nativeLengthutext_nativeLength_71(dest[i]), remainingChars, remaining16Length, &status);
2183 } else {
2184 UText remainingText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
2185 utext_openUCharsutext_openUChars_71(&remainingText, remainingChars, remaining16Length, &status);
2186 dest[i] = utext_cloneutext_clone_71(NULL__null, &remainingText, TRUE1, FALSE0, &status);
2187 utext_closeutext_close_71(&remainingText);
2188 }
2189
2190 uprv_freeuprv_free_71(remainingChars);
2191 }
2192 nextOutputStringStart = fMatchEnd;
2193
2194 // If the delimiter pattern has capturing parentheses, the captured
2195 // text goes out into the next n destination strings.
2196 int32_t groupNum;
2197 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2198 if (i >= destCapacity-2) {
2199 // Never fill the last available output string with capture group text.
2200 // It will filled with the last field, the remainder of the
2201 // unsplit input text.
2202 break;
2203 }
2204 i++;
2205 dest[i] = utext_extract_replace(fInputText, dest[i],
2206 start64(groupNum, status), end64(groupNum, status), &status);
2207 }
2208
2209 if (nextOutputStringStart == fActiveLimit) {
2210 // The delimiter was at the end of the string. We're done, but first
2211 // we output one last empty string, for the empty field following
2212 // the delimiter at the end of input.
2213 if (i+1 < destCapacity) {
2214 ++i;
2215 if (dest[i] == NULL__null) {
2216 dest[i] = utext_openUCharsutext_openUChars_71(NULL__null, NULL__null, 0, &status);
2217 } else {
2218 static const UChar emptyString[] = {(UChar)0};
2219 utext_replaceutext_replace_71(dest[i], 0, utext_nativeLengthutext_nativeLength_71(dest[i]), emptyString, 0, &status);
2220 }
2221 }
2222 break;
2223
2224 }
2225 }
2226 else
2227 {
2228 // We ran off the end of the input while looking for the next delimiter.
2229 // All the remaining text goes into the current output string.
2230 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)((0==((input)->chunkNativeStart))&&((fInputLength)
==((input)->chunkNativeLimit))&&((fInputLength)==(
(input)->nativeIndexingLimit)))
) {
2231 if (dest[i]) {
2232 utext_replaceutext_replace_71(dest[i], 0, utext_nativeLengthutext_nativeLength_71(dest[i]),
2233 input->chunkContents+nextOutputStringStart,
2234 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2235 } else {
2236 UText remainingText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
2237 utext_openUCharsutext_openUChars_71(&remainingText, input->chunkContents+nextOutputStringStart,
2238 fActiveLimit-nextOutputStringStart, &status);
2239 dest[i] = utext_cloneutext_clone_71(NULL__null, &remainingText, TRUE1, FALSE0, &status);
2240 utext_closeutext_close_71(&remainingText);
2241 }
2242 } else {
2243 UErrorCode lengthStatus = U_ZERO_ERROR;
2244 int32_t remaining16Length = utext_extractutext_extract_71(input, nextOutputStringStart, fActiveLimit, NULL__null, 0, &lengthStatus);
2245 UChar *remainingChars = (UChar *)uprv_mallocuprv_malloc_71(sizeof(UChar)*(remaining16Length+1));
2246 if (remainingChars == NULL__null) {
2247 status = U_MEMORY_ALLOCATION_ERROR;
2248 break;
2249 }
2250
2251 utext_extractutext_extract_71(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2252 if (dest[i]) {
2253 utext_replaceutext_replace_71(dest[i], 0, utext_nativeLengthutext_nativeLength_71(dest[i]), remainingChars, remaining16Length, &status);
2254 } else {
2255 UText remainingText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
2256 utext_openUCharsutext_openUChars_71(&remainingText, remainingChars, remaining16Length, &status);
2257 dest[i] = utext_cloneutext_clone_71(NULL__null, &remainingText, TRUE1, FALSE0, &status);
2258 utext_closeutext_close_71(&remainingText);
2259 }
2260
2261 uprv_freeuprv_free_71(remainingChars);
2262 }
2263 break;
2264 }
2265 if (U_FAILURE(status)) {
2266 break;
2267 }
2268 } // end of for loop
2269 return i+1;
2270}
2271
2272
2273//--------------------------------------------------------------------------------
2274//
2275// start
2276//
2277//--------------------------------------------------------------------------------
2278int32_t RegexMatcher::start(UErrorCode &status) const {
2279 return start(0, status);
2280}
2281
2282int64_t RegexMatcher::start64(UErrorCode &status) const {
2283 return start64(0, status);
2284}
2285
2286//--------------------------------------------------------------------------------
2287//
2288// start(int32_t group, UErrorCode &status)
2289//
2290//--------------------------------------------------------------------------------
2291
2292int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2293 if (U_FAILURE(status)) {
2294 return -1;
2295 }
2296 if (U_FAILURE(fDeferredStatus)) {
2297 status = fDeferredStatus;
2298 return -1;
2299 }
2300 if (fMatch == FALSE0) {
2301 status = U_REGEX_INVALID_STATE;
2302 return -1;
2303 }
2304 if (group < 0 || group > fPattern->fGroupMap->size()) {
2305 status = U_INDEX_OUTOFBOUNDS_ERROR;
2306 return -1;
2307 }
2308 int64_t s;
2309 if (group == 0) {
2310 s = fMatchStart;
2311 } else {
2312 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2313 U_ASSERT(groupOffset < fPattern->fFrameSize)(void)0;
2314 U_ASSERT(groupOffset >= 0)(void)0;
2315 s = fFrame->fExtra[groupOffset];
2316 }
2317
2318 return s;
2319}
2320
2321
2322int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2323 return (int32_t)start64(group, status);
2324}
2325
2326//--------------------------------------------------------------------------------
2327//
2328// useAnchoringBounds
2329//
2330//--------------------------------------------------------------------------------
2331RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2332 fAnchoringBounds = b;
2333 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2334 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2335 return *this;
2336}
2337
2338
2339//--------------------------------------------------------------------------------
2340//
2341// useTransparentBounds
2342//
2343//--------------------------------------------------------------------------------
2344RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2345 fTransparentBounds = b;
2346 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2347 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2348 return *this;
2349}
2350
2351//--------------------------------------------------------------------------------
2352//
2353// setTimeLimit
2354//
2355//--------------------------------------------------------------------------------
2356void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2357 if (U_FAILURE(status)) {
2358 return;
2359 }
2360 if (U_FAILURE(fDeferredStatus)) {
2361 status = fDeferredStatus;
2362 return;
2363 }
2364 if (limit < 0) {
2365 status = U_ILLEGAL_ARGUMENT_ERROR;
2366 return;
2367 }
2368 fTimeLimit = limit;
2369}
2370
2371
2372//--------------------------------------------------------------------------------
2373//
2374// getTimeLimit
2375//
2376//--------------------------------------------------------------------------------
2377int32_t RegexMatcher::getTimeLimit() const {
2378 return fTimeLimit;
2379}
2380
2381
2382//--------------------------------------------------------------------------------
2383//
2384// setStackLimit
2385//
2386//--------------------------------------------------------------------------------
2387void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2388 if (U_FAILURE(status)) {
2389 return;
2390 }
2391 if (U_FAILURE(fDeferredStatus)) {
2392 status = fDeferredStatus;
2393 return;
2394 }
2395 if (limit < 0) {
2396 status = U_ILLEGAL_ARGUMENT_ERROR;
2397 return;
2398 }
2399
2400 // Reset the matcher. This is needed here in case there is a current match
2401 // whose final stack frame (containing the match results, pointed to by fFrame)
2402 // would be lost by resizing to a smaller stack size.
2403 reset();
2404
2405 if (limit == 0) {
2406 // Unlimited stack expansion
2407 fStack->setMaxCapacity(0);
2408 } else {
2409 // Change the units of the limit from bytes to ints, and bump the size up
2410 // to be big enough to hold at least one stack frame for the pattern,
2411 // if it isn't there already.
2412 int32_t adjustedLimit = limit / sizeof(int32_t);
2413 if (adjustedLimit < fPattern->fFrameSize) {
2414 adjustedLimit = fPattern->fFrameSize;
2415 }
2416 fStack->setMaxCapacity(adjustedLimit);
2417 }
2418 fStackLimit = limit;
2419}
2420
2421
2422//--------------------------------------------------------------------------------
2423//
2424// getStackLimit
2425//
2426//--------------------------------------------------------------------------------
2427int32_t RegexMatcher::getStackLimit() const {
2428 return fStackLimit;
2429}
2430
2431
2432//--------------------------------------------------------------------------------
2433//
2434// setMatchCallback
2435//
2436//--------------------------------------------------------------------------------
2437void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2438 const void *context,
2439 UErrorCode &status) {
2440 if (U_FAILURE(status)) {
2441 return;
2442 }
2443 fCallbackFn = callback;
2444 fCallbackContext = context;
2445}
2446
2447
2448//--------------------------------------------------------------------------------
2449//
2450// getMatchCallback
2451//
2452//--------------------------------------------------------------------------------
2453void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2454 const void *&context,
2455 UErrorCode &status) {
2456 if (U_FAILURE(status)) {
2457 return;
2458 }
2459 callback = fCallbackFn;
2460 context = fCallbackContext;
2461}
2462
2463
2464//--------------------------------------------------------------------------------
2465//
2466// setMatchCallback
2467//
2468//--------------------------------------------------------------------------------
2469void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2470 const void *context,
2471 UErrorCode &status) {
2472 if (U_FAILURE(status)) {
2473 return;
2474 }
2475 fFindProgressCallbackFn = callback;
2476 fFindProgressCallbackContext = context;
2477}
2478
2479
2480//--------------------------------------------------------------------------------
2481//
2482// getMatchCallback
2483//
2484//--------------------------------------------------------------------------------
2485void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2486 const void *&context,
2487 UErrorCode &status) {
2488 if (U_FAILURE(status)) {
2489 return;
2490 }
2491 callback = fFindProgressCallbackFn;
2492 context = fFindProgressCallbackContext;
2493}
2494
2495
2496//================================================================================
2497//
2498// Code following this point in this file is the internal
2499// Match Engine Implementation.
2500//
2501//================================================================================
2502
2503
2504//--------------------------------------------------------------------------------
2505//
2506// resetStack
2507// Discard any previous contents of the state save stack, and initialize a
2508// new stack frame to all -1. The -1s are needed for capture group limits,
2509// where they indicate that a group has not yet matched anything.
2510//--------------------------------------------------------------------------------
2511REStackFrame *RegexMatcher::resetStack() {
2512 // Discard any previous contents of the state save stack, and initialize a
2513 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2514 // where they indicate that a group has not yet matched anything.
2515 fStack->removeAllElements();
2516
2517 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2518 if(U_FAILURE(fDeferredStatus)) {
2519 return NULL__null;
2520 }
2521
2522 int32_t i;
2523 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT2; i++) {
2524 iFrame->fExtra[i] = -1;
2525 }
2526 return iFrame;
2527}
2528
2529
2530
2531//--------------------------------------------------------------------------------
2532//
2533// isWordBoundary
2534// in perl, "xab..cd..", \b is true at positions 0,3,5,7
2535// For us,
2536// If the current char is a combining mark,
2537// \b is FALSE.
2538// Else Scan backwards to the first non-combining char.
2539// We are at a boundary if the this char and the original chars are
2540// opposite in membership in \w set
2541//
2542// parameters: pos - the current position in the input buffer
2543//
2544// TODO: double-check edge cases at region boundaries.
2545//
2546//--------------------------------------------------------------------------------
2547UBool RegexMatcher::isWordBoundary(int64_t pos) {
2548 UBool isBoundary = FALSE0;
2549 UBool cIsWord = FALSE0;
2550
2551 if (pos >= fLookLimit) {
2552 fHitEnd = TRUE1;
2553 } else {
2554 // Determine whether char c at current position is a member of the word set of chars.
2555 // If we're off the end of the string, behave as though we're not at a word char.
2556 UTEXT_SETNATIVEINDEX(fInputText, pos)do { int64_t __offset = (pos) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (pos
)); } } while (false)
;
2557 UChar32 c = UTEXT_CURRENT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)] : utext_current32_71(fInputText))
;
2558 if (u_hasBinaryPropertyu_hasBinaryProperty_71(c, UCHAR_GRAPHEME_EXTEND) || u_charTypeu_charType_71(c) == U_FORMAT_CHAR) {
2559 // Current char is a combining one. Not a boundary.
2560 return FALSE0;
2561 }
2562 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2563 }
2564
2565 // Back up until we come to a non-combining char, determine whether
2566 // that char is a word char.
2567 UBool prevCIsWord = FALSE0;
2568 for (;;) {
2569 if (UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
<= fLookStart) {
2570 break;
2571 }
2572 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
;
2573 if (!(u_hasBinaryPropertyu_hasBinaryProperty_71(prevChar, UCHAR_GRAPHEME_EXTEND)
2574 || u_charTypeu_charType_71(prevChar) == U_FORMAT_CHAR)) {
2575 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2576 break;
2577 }
2578 }
2579 isBoundary = cIsWord ^ prevCIsWord;
2580 return isBoundary;
2581}
2582
2583UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2584 UBool isBoundary = FALSE0;
2585 UBool cIsWord = FALSE0;
2586
2587 const UChar *inputBuf = fInputText->chunkContents;
2588
2589 if (pos >= fLookLimit) {
2590 fHitEnd = TRUE1;
2591 } else {
2592 // Determine whether char c at current position is a member of the word set of chars.
2593 // If we're off the end of the string, behave as though we're not at a word char.
2594 UChar32 c;
2595 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c)do { (c)=(inputBuf)[pos]; if((((c)&0xfffff800)==0xd800)) {
uint16_t __c2; if((((c)&0x400)==0)) { if((pos)+1!=(fLookLimit
) && (((__c2=(inputBuf)[(pos)+1])&0xfffffc00)==0xdc00
)) { (c)=(((UChar32)((c))<<10UL)+(UChar32)(__c2)-((0xd800
<<10UL)+0xdc00-0x10000)); } } else { if((pos)>(fLookStart
) && (((__c2=(inputBuf)[(pos)-1])&0xfffffc00)==0xd800
)) { (c)=(((UChar32)(__c2)<<10UL)+(UChar32)((c))-((0xd800
<<10UL)+0xdc00-0x10000)); } } } } while (false)
;
2596 if (u_hasBinaryPropertyu_hasBinaryProperty_71(c, UCHAR_GRAPHEME_EXTEND) || u_charTypeu_charType_71(c) == U_FORMAT_CHAR) {
2597 // Current char is a combining one. Not a boundary.
2598 return FALSE0;
2599 }
2600 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2601 }
2602
2603 // Back up until we come to a non-combining char, determine whether
2604 // that char is a word char.
2605 UBool prevCIsWord = FALSE0;
2606 for (;;) {
2607 if (pos <= fLookStart) {
2608 break;
2609 }
2610 UChar32 prevChar;
2611 U16_PREV(inputBuf, fLookStart, pos, prevChar)do { (prevChar)=(inputBuf)[--(pos)]; if((((prevChar)&0xfffffc00
)==0xdc00)) { uint16_t __c2; if((pos)>(fLookStart) &&
(((__c2=(inputBuf)[(pos)-1])&0xfffffc00)==0xd800)) { --(
pos); (prevChar)=(((UChar32)(__c2)<<10UL)+(UChar32)((prevChar
))-((0xd800<<10UL)+0xdc00-0x10000)); } } } while (false
)
;
2612 if (!(u_hasBinaryPropertyu_hasBinaryProperty_71(prevChar, UCHAR_GRAPHEME_EXTEND)
2613 || u_charTypeu_charType_71(prevChar) == U_FORMAT_CHAR)) {
2614 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2615 break;
2616 }
2617 }
2618 isBoundary = cIsWord ^ prevCIsWord;
2619 return isBoundary;
2620}
2621
2622//--------------------------------------------------------------------------------
2623//
2624// isUWordBoundary
2625//
2626// Test for a word boundary using RBBI word break.
2627//
2628// parameters: pos - the current position in the input buffer
2629//
2630//--------------------------------------------------------------------------------
2631UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
2632 UBool returnVal = FALSE0;
2633
2634#if UCONFIG_NO_BREAK_ITERATION0==0
2635 // Note: this point will never be reached if break iteration is configured out.
2636 // Regex patterns that would require this function will fail to compile.
2637
2638 // If we haven't yet created a break iterator for this matcher, do it now.
2639 if (fWordBreakItr == nullptr) {
2640 fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
2641 if (U_FAILURE(status)) {
2642 return FALSE0;
2643 }
2644 fWordBreakItr->setText(fInputText, status);
2645 }
2646
2647 // Note: zero width boundary tests like \b see through transparent region bounds,
2648 // which is why fLookLimit is used here, rather than fActiveLimit.
2649 if (pos >= fLookLimit) {
2650 fHitEnd = TRUE1;
2651 returnVal = TRUE1; // With Unicode word rules, only positions within the interior of "real"
2652 // words are not boundaries. All non-word chars stand by themselves,
2653 // with word boundaries on both sides.
2654 } else {
2655 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2656 }
2657#endif
2658 return returnVal;
2659}
2660
2661
2662int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
2663 int64_t result = pos;
2664
2665#if UCONFIG_NO_BREAK_ITERATION0==0
2666 // Note: this point will never be reached if break iteration is configured out.
2667 // Regex patterns that would require this function will fail to compile.
2668
2669 // If we haven't yet created a break iterator for this matcher, do it now.
2670 if (fGCBreakItr == nullptr) {
2671 fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2672 if (U_FAILURE(status)) {
2673 return pos;
2674 }
2675 fGCBreakItr->setText(fInputText, status);
2676 }
2677 result = fGCBreakItr->following(pos);
2678 if (result == BreakIterator::DONE) {
2679 result = pos;
2680 }
2681#endif
2682 return result;
2683}
2684
2685//--------------------------------------------------------------------------------
2686//
2687// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2688// saves. Increment the "time" counter, and call the
2689// user callback function if there is one installed.
2690//
2691// If the match operation needs to be aborted, either for a time-out
2692// or because the user callback asked for it, just set an error status.
2693// The engine will pick that up and stop in its outer loop.
2694//
2695//--------------------------------------------------------------------------------
2696void RegexMatcher::IncrementTime(UErrorCode &status) {
2697 fTickCounter = TIMER_INITIAL_VALUE;
2698 fTime++;
2699 if (fCallbackFn != NULL__null) {
2700 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE0) {
2701 status = U_REGEX_STOPPED_BY_CALLER;
2702 return;
2703 }
2704 }
2705 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2706 status = U_REGEX_TIME_OUT;
2707 }
2708}
2709
2710//--------------------------------------------------------------------------------
2711//
2712// StateSave
2713// Make a new stack frame, initialized as a copy of the current stack frame.
2714// Set the pattern index in the original stack frame from the operand value
2715// in the opcode. Execution of the engine continues with the state in
2716// the newly created stack frame
2717//
2718// Note that reserveBlock() may grow the stack, resulting in the
2719// whole thing being relocated in memory.
2720//
2721// Parameters:
2722// fp The top frame pointer when called. At return, a new
2723// fame will be present
2724// savePatIdx An index into the compiled pattern. Goes into the original
2725// (not new) frame. If execution ever back-tracks out of the
2726// new frame, this will be where we continue from in the pattern.
2727// Return
2728// The new frame pointer.
2729//
2730//--------------------------------------------------------------------------------
2731inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2732 if (U_FAILURE(status)) {
2733 return fp;
2734 }
2735 // push storage for a new frame.
2736 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2737 if (U_FAILURE(status)) {
2738 // Failure on attempted stack expansion.
2739 // Stack function set some other error code, change it to a more
2740 // specific one for regular expressions.
2741 status = U_REGEX_STACK_OVERFLOW;
2742 // We need to return a writable stack frame, so just return the
2743 // previous frame. The match operation will stop quickly
2744 // because of the error status, after which the frame will never
2745 // be looked at again.
2746 return fp;
2747 }
2748 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2749
2750 // New stack frame = copy of old top frame.
2751 int64_t *source = (int64_t *)fp;
2752 int64_t *dest = newFP;
2753 for (;;) {
2754 *dest++ = *source++;
2755 if (source == newFP) {
2756 break;
2757 }
2758 }
2759
2760 fTickCounter--;
2761 if (fTickCounter <= 0) {
2762 IncrementTime(status); // Re-initializes fTickCounter
2763 }
2764 fp->fPatIdx = savePatIdx;
2765 return (REStackFrame *)newFP;
2766}
2767
2768#if defined(REGEX_DEBUG)
2769namespace {
2770UnicodeString StringFromUText(UText *ut) {
2771 UnicodeString result;
2772 for (UChar32 c = utext_next32Fromutext_next32From_71(ut, 0); c != U_SENTINEL(-1); c = UTEXT_NEXT32(ut)((ut)->chunkOffset < (ut)->chunkLength && ((
ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? ((ut
)->chunkContents)[((ut)->chunkOffset)++] : utext_next32_71
(ut))
) {
2773 result.append(c);
2774 }
2775 return result;
2776}
2777}
2778#endif // REGEX_DEBUG
2779
2780
2781//--------------------------------------------------------------------------------
2782//
2783// MatchAt This is the actual matching engine.
2784//
2785// startIdx: begin matching a this index.
2786// toEnd: if true, match must extend to end of the input region
2787//
2788//--------------------------------------------------------------------------------
2789void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2790 UBool isMatch = FALSE0; // True if the we have a match.
2791
2792 int64_t backSearchIndex = U_INT64_MAX((int64_t)(9223372036854775807L)); // used after greedy single-character matches for searching backwards
2793
2794 int32_t op; // Operation from the compiled pattern, split into
2795 int32_t opType; // the opcode
2796 int32_t opValue; // and the operand value.
2797
2798#ifdef REGEX_RUN_DEBUG
2799 if (fTraceDebug) {
2800 printf("MatchAt(startIdx=%ld)\n", startIdx);
2801 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2802 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2803 }
2804#endif
2805
2806 if (U_FAILURE(status)) {
2807 return;
2808 }
2809
2810 // Cache frequently referenced items from the compiled pattern
2811 //
2812 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2813
2814 const UChar *litText = fPattern->fLiteralText.getBuffer();
2815 UVector *fSets = fPattern->fSets;
2816
2817 fFrameSize = fPattern->fFrameSize;
2818 REStackFrame *fp = resetStack();
2819 if (U_FAILURE(fDeferredStatus)) {
2820 status = fDeferredStatus;
2821 return;
2822 }
2823
2824 fp->fPatIdx = 0;
2825 fp->fInputIdx = startIdx;
2826
2827 // Zero out the pattern's static data
2828 int32_t i;
2829 for (i = 0; i<fPattern->fDataSize; i++) {
2830 fData[i] = 0;
2831 }
2832
2833 //
2834 // Main loop for interpreting the compiled pattern.
2835 // One iteration of the loop per pattern operation performed.
2836 //
2837 for (;;) {
2838 op = (int32_t)pat[fp->fPatIdx];
2839 opType = URX_TYPE(op)((uint32_t)(op) >> 24);
2840 opValue = URX_VAL(op)((op) & 0xffffff);
2841#ifdef REGEX_RUN_DEBUG
2842 if (fTraceDebug) {
2843 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
2844 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2845 UTEXT_CURRENT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)] : utext_current32_71(fInputText))
, (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2846 fPattern->dumpOp(fp->fPatIdx);
2847 }
2848#endif
2849 fp->fPatIdx++;
2850
2851 switch (opType) {
2852
2853
2854 case URX_NOP:
2855 break;
2856
2857
2858 case URX_BACKTRACK:
2859 // Force a backtrack. In some circumstances, the pattern compiler
2860 // will notice that the pattern can't possibly match anything, and will
2861 // emit one of these at that point.
2862 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2863 break;
2864
2865
2866 case URX_ONECHAR:
2867 if (fp->fInputIdx < fActiveLimit) {
2868 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
2869 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
2870 if (c == opValue) {
2871 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
2872 break;
2873 }
2874 } else {
2875 fHitEnd = TRUE1;
2876 }
2877 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2878 break;
2879
2880
2881 case URX_STRING:
2882 {
2883 // Test input against a literal string.
2884 // Strings require two slots in the compiled pattern, one for the
2885 // offset to the string text, and one for the length.
2886
2887 int32_t stringStartIdx = opValue;
2888 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2889 fp->fPatIdx++;
2890 opType = URX_TYPE(op)((uint32_t)(op) >> 24);
2891 int32_t stringLen = URX_VAL(op)((op) & 0xffffff);
2892 U_ASSERT(opType == URX_STRING_LEN)(void)0;
2893 U_ASSERT(stringLen >= 2)(void)0;
2894
2895 const UChar *patternString = litText+stringStartIdx;
2896 int32_t patternStringIndex = 0;
2897 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
2898 UChar32 inputChar;
2899 UChar32 patternChar;
2900 UBool success = TRUE1;
2901 while (patternStringIndex < stringLen) {
2902 if (UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
>= fActiveLimit) {
2903 success = FALSE0;
2904 fHitEnd = TRUE1;
2905 break;
2906 }
2907 inputChar = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
2908 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar)do { (patternChar)=(patternString)[(patternStringIndex)++]; if
((((patternChar)&0xfffffc00)==0xd800)) { uint16_t __c2; if
((patternStringIndex)!=(stringLen) && (((__c2=(patternString
)[(patternStringIndex)])&0xfffffc00)==0xdc00)) { ++(patternStringIndex
); (patternChar)=(((UChar32)((patternChar))<<10UL)+(UChar32
)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); } } } while (
false)
;
2909 if (patternChar != inputChar) {
2910 success = FALSE0;
2911 break;
2912 }
2913 }
2914
2915 if (success) {
2916 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
2917 } else {
2918 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2919 }
2920 }
2921 break;
2922
2923
2924 case URX_STATE_SAVE:
2925 fp = StateSave(fp, opValue, status);
2926 break;
2927
2928
2929 case URX_END:
2930 // The match loop will exit via this path on a successful match,
2931 // when we reach the end of the pattern.
2932 if (toEnd && fp->fInputIdx != fActiveLimit) {
2933 // The pattern matched, but not to the end of input. Try some more.
2934 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2935 break;
2936 }
2937 isMatch = TRUE1;
2938 goto breakFromLoop;
2939
2940 // Start and End Capture stack frame variables are laid out out like this:
2941 // fp->fExtra[opValue] - The start of a completed capture group
2942 // opValue+1 - The end of a completed capture group
2943 // opValue+2 - the start of a capture group whose end
2944 // has not yet been reached (and might not ever be).
2945 case URX_START_CAPTURE:
2946 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3)(void)0;
2947 fp->fExtra[opValue+2] = fp->fInputIdx;
2948 break;
2949
2950
2951 case URX_END_CAPTURE:
2952 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3)(void)0;
2953 U_ASSERT(fp->fExtra[opValue+2] >= 0)(void)0; // Start pos for this group must be set.
2954 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2955 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2956 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1])(void)0;
2957 break;
2958
2959
2960 case URX_DOLLAR: // $, test for End of line
2961 // or for position before new line at end of input
2962 {
2963 if (fp->fInputIdx >= fAnchorLimit) {
2964 // We really are at the end of input. Success.
2965 fHitEnd = TRUE1;
2966 fRequireEnd = TRUE1;
2967 break;
2968 }
2969
2970 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
2971
2972 // If we are positioned just before a new-line that is located at the
2973 // end of input, succeed.
2974 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
2975 if (UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
>= fAnchorLimit) {
2976 if (isLineTerminator(c)) {
2977 // If not in the middle of a CR/LF sequence
2978 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
, UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
)==0x0d)) {
2979 // At new-line at end of input. Success
2980 fHitEnd = TRUE1;
2981 fRequireEnd = TRUE1;
2982
2983 break;
2984 }
2985 }
2986 } else {
2987 UChar32 nextC = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
2988 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
>= fAnchorLimit) {
2989 fHitEnd = TRUE1;
2990 fRequireEnd = TRUE1;
2991 break; // At CR/LF at end of input. Success
2992 }
2993 }
2994
2995 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2996 }
2997 break;
2998
2999
3000 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
3001 if (fp->fInputIdx >= fAnchorLimit) {
3002 // Off the end of input. Success.
3003 fHitEnd = TRUE1;
3004 fRequireEnd = TRUE1;
3005 break;
3006 } else {
3007 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3008 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3009 // Either at the last character of input, or off the end.
3010 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
== fAnchorLimit) {
3011 fHitEnd = TRUE1;
3012 fRequireEnd = TRUE1;
3013 break;
3014 }
3015 }
3016
3017 // Not at end of input. Back-track out.
3018 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3019 break;
3020
3021
3022 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
3023 {
3024 if (fp->fInputIdx >= fAnchorLimit) {
3025 // We really are at the end of input. Success.
3026 fHitEnd = TRUE1;
3027 fRequireEnd = TRUE1;
3028 break;
3029 }
3030 // If we are positioned just before a new-line, succeed.
3031 // It makes no difference where the new-line is within the input.
3032 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3033 UChar32 c = UTEXT_CURRENT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)] : utext_current32_71(fInputText))
;
3034 if (isLineTerminator(c)) {
3035 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3036 // In multi-line mode, hitting a new-line just before the end of input does not
3037 // set the hitEnd or requireEnd flags
3038 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
==0x0d)) {
3039 break;
3040 }
3041 }
3042 // not at a new line. Fail.
3043 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3044 }
3045 break;
3046
3047
3048 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3049 {
3050 if (fp->fInputIdx >= fAnchorLimit) {
3051 // We really are at the end of input. Success.
3052 fHitEnd = TRUE1;
3053 fRequireEnd = TRUE1; // Java set requireEnd in this case, even though
3054 break; // adding a new-line would not lose the match.
3055 }
3056 // If we are not positioned just before a new-line, the test fails; backtrack out.
3057 // It makes no difference where the new-line is within the input.
3058 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3059 if (UTEXT_CURRENT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)] : utext_current32_71(fInputText))
!= 0x0a) {
3060 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3061 }
3062 }
3063 break;
3064
3065
3066 case URX_CARET: // ^, test for start of line
3067 if (fp->fInputIdx != fAnchorStart) {
3068 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3069 }
3070 break;
3071
3072
3073 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3074 {
3075 if (fp->fInputIdx == fAnchorStart) {
3076 // We are at the start input. Success.
3077 break;
3078 }
3079 // Check whether character just before the current pos is a new-line
3080 // unless we are at the end of input
3081 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3082 UChar32 c = UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
;
3083 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3084 // It's a new-line. ^ is true. Success.
3085 // TODO: what should be done with positions between a CR and LF?
3086 break;
3087 }
3088 // Not at the start of a line. Fail.
3089 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3090 }
3091 break;
3092
3093
3094 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3095 {
3096 U_ASSERT(fp->fInputIdx >= fAnchorStart)(void)0;
3097 if (fp->fInputIdx <= fAnchorStart) {
3098 // We are at the start input. Success.
3099 break;
3100 }
3101 // Check whether character just before the current pos is a new-line
3102 U_ASSERT(fp->fInputIdx <= fAnchorLimit)(void)0;
3103 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3104 UChar32 c = UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
;
3105 if (c != 0x0a) {
3106 // Not at the start of a line. Back-track out.
3107 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3108 }
3109 }
3110 break;
3111
3112 case URX_BACKSLASH_B: // Test for word boundaries
3113 {
3114 UBool success = isWordBoundary(fp->fInputIdx);
3115 success ^= (UBool)(opValue != 0); // flip sense for \B
3116 if (!success) {
3117 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3118 }
3119 }
3120 break;
3121
3122
3123 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3124 {
3125 UBool success = isUWordBoundary(fp->fInputIdx, status);
3126 success ^= (UBool)(opValue != 0); // flip sense for \B
3127 if (!success) {
3128 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129 }
3130 }
3131 break;
3132
3133
3134 case URX_BACKSLASH_D: // Test for decimal digit
3135 {
3136 if (fp->fInputIdx >= fActiveLimit) {
3137 fHitEnd = TRUE1;
3138 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139 break;
3140 }
3141
3142 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3143
3144 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3145 int8_t ctype = u_charTypeu_charType_71(c); // TODO: make a unicode set for this. Will be faster.
3146 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3147 success ^= (UBool)(opValue != 0); // flip sense for \D
3148 if (success) {
3149 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3150 } else {
3151 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3152 }
3153 }
3154 break;
3155
3156
3157 case URX_BACKSLASH_G: // Test for position at end of previous match
3158 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE0 && fp->fInputIdx==fActiveStart))) {
3159 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3160 }
3161 break;
3162
3163
3164 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3165 {
3166 if (fp->fInputIdx >= fActiveLimit) {
3167 fHitEnd = TRUE1;
3168 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3169 break;
3170 }
3171 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3172 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3173 int8_t ctype = u_charTypeu_charType_71(c);
3174 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3175 success ^= (UBool)(opValue != 0); // flip sense for \H
3176 if (success) {
3177 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3178 } else {
3179 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3180 }
3181 }
3182 break;
3183
3184
3185 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3186 {
3187 if (fp->fInputIdx >= fActiveLimit) {
3188 fHitEnd = TRUE1;
3189 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3190 break;
3191 }
3192 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3193 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3194 if (isLineTerminator(c)) {
3195 if (c == 0x0d && utext_current32utext_current32_71(fInputText) == 0x0a) {
3196 utext_next32utext_next32_71(fInputText);
3197 }
3198 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3199 } else {
3200 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201 }
3202 }
3203 break;
3204
3205
3206 case URX_BACKSLASH_V: // \v, any single line ending character.
3207 {
3208 if (fp->fInputIdx >= fActiveLimit) {
3209 fHitEnd = TRUE1;
3210 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3211 break;
3212 }
3213 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3214 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3215 UBool success = isLineTerminator(c);
3216 success ^= (UBool)(opValue != 0); // flip sense for \V
3217 if (success) {
3218 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3219 } else {
3220 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3221 }
3222 }
3223 break;
3224
3225
3226 case URX_BACKSLASH_X:
3227 // Match a Grapheme, as defined by Unicode UAX 29.
3228
3229 // Fail if at end of input
3230 if (fp->fInputIdx >= fActiveLimit) {
3231 fHitEnd = TRUE1;
3232 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3233 break;
3234 }
3235
3236 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
3237 if (fp->fInputIdx >= fActiveLimit) {
3238 fHitEnd = TRUE1;
3239 fp->fInputIdx = fActiveLimit;
3240 }
3241 break;
3242
3243
3244 case URX_BACKSLASH_Z: // Test for end of Input
3245 if (fp->fInputIdx < fAnchorLimit) {
3246 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3247 } else {
3248 fHitEnd = TRUE1;
3249 fRequireEnd = TRUE1;
3250 }
3251 break;
3252
3253
3254
3255 case URX_STATIC_SETREF:
3256 {
3257 // Test input character against one of the predefined sets
3258 // (Word Characters, for example)
3259 // The high bit of the op value is a flag for the match polarity.
3260 // 0: success if input char is in set.
3261 // 1: success if input char is not in set.
3262 if (fp->fInputIdx >= fActiveLimit) {
3263 fHitEnd = TRUE1;
3264 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3265 break;
3266 }
3267
3268 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3269 opValue &= ~URX_NEG_SET;
3270 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET)(void)0;
3271
3272 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3273 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3274 if (c < 256) {
3275 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3276 if (s8.contains(c)) {
3277 success = !success;
3278 }
3279 } else {
3280 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3281 if (s.contains(c)) {
3282 success = !success;
3283 }
3284 }
3285 if (success) {
3286 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3287 } else {
3288 // the character wasn't in the set.
3289 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3290 }
3291 }
3292 break;
3293
3294
3295 case URX_STAT_SETREF_N:
3296 {
3297 // Test input character for NOT being a member of one of
3298 // the predefined sets (Word Characters, for example)
3299 if (fp->fInputIdx >= fActiveLimit) {
3300 fHitEnd = TRUE1;
3301 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3302 break;
3303 }
3304
3305 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET)(void)0;
3306
3307 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3308
3309 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3310 if (c < 256) {
3311 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3312 if (s8.contains(c) == FALSE0) {
3313 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3314 break;
3315 }
3316 } else {
3317 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3318 if (s.contains(c) == FALSE0) {
3319 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3320 break;
3321 }
3322 }
3323 // the character wasn't in the set.
3324 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3325 }
3326 break;
3327
3328
3329 case URX_SETREF:
3330 if (fp->fInputIdx >= fActiveLimit) {
3331 fHitEnd = TRUE1;
3332 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3333 break;
3334 } else {
3335 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3336
3337 // There is input left. Pick up one char and test it for set membership.
3338 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3339 U_ASSERT(opValue > 0 && opValue < fSets->size())(void)0;
3340 if (c<256) {
3341 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3342 if (s8->contains(c)) {
3343 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3344 break;
3345 }
3346 } else {
3347 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3348 if (s->contains(c)) {
3349 // The character is in the set. A Match.
3350 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3351 break;
3352 }
3353 }
3354
3355 // the character wasn't in the set.
3356 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3357 }
3358 break;
3359
3360
3361 case URX_DOTANY:
3362 {
3363 // . matches anything, but stops at end-of-line.
3364 if (fp->fInputIdx >= fActiveLimit) {
3365 // At end of input. Match failed. Backtrack out.
3366 fHitEnd = TRUE1;
3367 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3368 break;
3369 }
3370
3371 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3372
3373 // There is input left. Advance over one char, unless we've hit end-of-line
3374 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3375 if (isLineTerminator(c)) {
3376 // End of line in normal mode. . does not match.
3377 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3378 break;
3379 }
3380 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3381 }
3382 break;
3383
3384
3385 case URX_DOTANY_ALL:
3386 {
3387 // ., in dot-matches-all (including new lines) mode
3388 if (fp->fInputIdx >= fActiveLimit) {
3389 // At end of input. Match failed. Backtrack out.
3390 fHitEnd = TRUE1;
3391 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3392 break;
3393 }
3394
3395 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3396
3397 // There is input left. Advance over one char, except if we are
3398 // at a cr/lf, advance over both of them.
3399 UChar32 c;
3400 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3401 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3402 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3403 // In the case of a CR/LF, we need to advance over both.
3404 UChar32 nextc = UTEXT_CURRENT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)] : utext_current32_71(fInputText))
;
3405 if (nextc == 0x0a) {
3406 (void)UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3407 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3408 }
3409 }
3410 }
3411 break;
3412
3413
3414 case URX_DOTANY_UNIX:
3415 {
3416 // '.' operator, matches all, but stops at end-of-line.
3417 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3418 if (fp->fInputIdx >= fActiveLimit) {
3419 // At end of input. Match failed. Backtrack out.
3420 fHitEnd = TRUE1;
3421 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3422 break;
3423 }
3424
3425 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3426
3427 // There is input left. Advance over one char, unless we've hit end-of-line
3428 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3429 if (c == 0x0a) {
3430 // End of line in normal mode. '.' does not match the \n
3431 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3432 } else {
3433 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3434 }
3435 }
3436 break;
3437
3438
3439 case URX_JMP:
3440 fp->fPatIdx = opValue;
3441 break;
3442
3443 case URX_FAIL:
3444 isMatch = FALSE0;
3445 goto breakFromLoop;
3446
3447 case URX_JMP_SAV:
3448 U_ASSERT(opValue < fPattern->fCompiledPat->size())(void)0;
3449 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3450 fp->fPatIdx = opValue; // Then JMP.
3451 break;
3452
3453 case URX_JMP_SAV_X:
3454 // This opcode is used with (x)+, when x can match a zero length string.
3455 // Same as JMP_SAV, except conditional on the match having made forward progress.
3456 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3457 // data address of the input position at the start of the loop.
3458 {
3459 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size())(void)0;
3460 int32_t stoOp = (int32_t)pat[opValue-1];
3461 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC)(void)0;
3462 int32_t frameLoc = URX_VAL(stoOp)((stoOp) & 0xffffff);
3463 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize)(void)0;
3464 int64_t prevInputIdx = fp->fExtra[frameLoc];
3465 U_ASSERT(prevInputIdx <= fp->fInputIdx)(void)0;
3466 if (prevInputIdx < fp->fInputIdx) {
3467 // The match did make progress. Repeat the loop.
3468 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3469 fp->fPatIdx = opValue;
3470 fp->fExtra[frameLoc] = fp->fInputIdx;
3471 }
3472 // If the input position did not advance, we do nothing here,
3473 // execution will fall out of the loop.
3474 }
3475 break;
3476
3477 case URX_CTR_INIT:
3478 {
3479 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2)(void)0;
3480 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3481
3482 // Pick up the three extra operands that CTR_INIT has, and
3483 // skip the pattern location counter past
3484 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3485 fp->fPatIdx += 3;
3486 int32_t loopLoc = URX_VAL(pat[instrOperandLoc])((pat[instrOperandLoc]) & 0xffffff);
3487 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3488 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3489 U_ASSERT(minCount>=0)(void)0;
3490 U_ASSERT(maxCount>=minCount || maxCount==-1)(void)0;
3491 U_ASSERT(loopLoc>=fp->fPatIdx)(void)0;
3492
3493 if (minCount == 0) {
3494 fp = StateSave(fp, loopLoc+1, status);
3495 }
3496 if (maxCount == -1) {
3497 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3498 } else if (maxCount == 0) {
3499 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3500 }
3501 }
3502 break;
3503
3504 case URX_CTR_LOOP:
3505 {
3506 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2)(void)0;
3507 int32_t initOp = (int32_t)pat[opValue];
3508 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT)(void)0;
3509 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)((initOp) & 0xffffff)];
3510 int32_t minCount = (int32_t)pat[opValue+2];
3511 int32_t maxCount = (int32_t)pat[opValue+3];
3512 (*pCounter)++;
3513 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3514 U_ASSERT(*pCounter == maxCount)(void)0;
3515 break;
3516 }
3517 if (*pCounter >= minCount) {
3518 if (maxCount == -1) {
3519 // Loop has no hard upper bound.
3520 // Check that it is progressing through the input, break if it is not.
3521 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp)((initOp) & 0xffffff) + 1];
3522 if (fp->fInputIdx == *pLastInputIdx) {
3523 break;
3524 } else {
3525 *pLastInputIdx = fp->fInputIdx;
3526 }
3527 }
3528 fp = StateSave(fp, fp->fPatIdx, status);
3529 } else {
3530 // Increment time-out counter. (StateSave() does it if count >= minCount)
3531 fTickCounter--;
3532 if (fTickCounter <= 0) {
3533 IncrementTime(status); // Re-initializes fTickCounter
3534 }
3535 }
3536
3537 fp->fPatIdx = opValue + 4; // Loop back.
3538 }
3539 break;
3540
3541 case URX_CTR_INIT_NG:
3542 {
3543 // Initialize a non-greedy loop
3544 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2)(void)0;
3545 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3546
3547 // Pick up the three extra operands that CTR_INIT_NG has, and
3548 // skip the pattern location counter past
3549 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3550 fp->fPatIdx += 3;
3551 int32_t loopLoc = URX_VAL(pat[instrOperandLoc])((pat[instrOperandLoc]) & 0xffffff);
3552 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3553 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3554 U_ASSERT(minCount>=0)(void)0;
3555 U_ASSERT(maxCount>=minCount || maxCount==-1)(void)0;
3556 U_ASSERT(loopLoc>fp->fPatIdx)(void)0;
3557 if (maxCount == -1) {
3558 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3559 }
3560
3561 if (minCount == 0) {
3562 if (maxCount != 0) {
3563 fp = StateSave(fp, fp->fPatIdx, status);
3564 }
3565 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3566 }
3567 }
3568 break;
3569
3570 case URX_CTR_LOOP_NG:
3571 {
3572 // Non-greedy {min, max} loops
3573 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2)(void)0;
3574 int32_t initOp = (int32_t)pat[opValue];
3575 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG)(void)0;
3576 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)((initOp) & 0xffffff)];
3577 int32_t minCount = (int32_t)pat[opValue+2];
3578 int32_t maxCount = (int32_t)pat[opValue+3];
3579
3580 (*pCounter)++;
3581 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3582 // The loop has matched the maximum permitted number of times.
3583 // Break out of here with no action. Matching will
3584 // continue with the following pattern.
3585 U_ASSERT(*pCounter == maxCount)(void)0;
3586 break;
3587 }
3588
3589 if (*pCounter < minCount) {
3590 // We haven't met the minimum number of matches yet.
3591 // Loop back for another one.
3592 fp->fPatIdx = opValue + 4; // Loop back.
3593 // Increment time-out counter. (StateSave() does it if count >= minCount)
3594 fTickCounter--;
3595 if (fTickCounter <= 0) {
3596 IncrementTime(status); // Re-initializes fTickCounter
3597 }
3598 } else {
3599 // We do have the minimum number of matches.
3600
3601 // If there is no upper bound on the loop iterations, check that the input index
3602 // is progressing, and stop the loop if it is not.
3603 if (maxCount == -1) {
3604 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp)((initOp) & 0xffffff) + 1];
3605 if (fp->fInputIdx == *pLastInputIdx) {
3606 break;
3607 }
3608 *pLastInputIdx = fp->fInputIdx;
3609 }
3610
3611 // Loop Continuation: we will fall into the pattern following the loop
3612 // (non-greedy, don't execute loop body first), but first do
3613 // a state save to the top of the loop, so that a match failure
3614 // in the following pattern will try another iteration of the loop.
3615 fp = StateSave(fp, opValue + 4, status);
3616 }
3617 }
3618 break;
3619
3620 case URX_STO_SP:
3621 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize)(void)0;
3622 fData[opValue] = fStack->size();
3623 break;
3624
3625 case URX_LD_SP:
3626 {
3627 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize)(void)0;
3628 int32_t newStackSize = (int32_t)fData[opValue];
3629 U_ASSERT(newStackSize <= fStack->size())(void)0;
3630 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3631 if (newFP == (int64_t *)fp) {
3632 break;
3633 }
3634 int32_t j;
3635 for (j=0; j<fFrameSize; j++) {
3636 newFP[j] = ((int64_t *)fp)[j];
3637 }
3638 fp = (REStackFrame *)newFP;
3639 fStack->setSize(newStackSize);
3640 }
3641 break;
3642
3643 case URX_BACKREF:
3644 {
3645 U_ASSERT(opValue < fFrameSize)(void)0;
3646 int64_t groupStartIdx = fp->fExtra[opValue];
3647 int64_t groupEndIdx = fp->fExtra[opValue+1];
3648 U_ASSERT(groupStartIdx <= groupEndIdx)(void)0;
3649 if (groupStartIdx < 0) {
3650 // This capture group has not participated in the match thus far,
3651 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3652 break;
3653 }
3654 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx)do { int64_t __offset = (groupStartIdx) - (fAltInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fAltInputText)->nativeIndexingLimit && (fAltInputText
)->chunkContents[__offset]<0xdc00) { (fAltInputText)->
chunkOffset=(int32_t)__offset; } else { utext_setNativeIndex_71
((fAltInputText), (groupStartIdx)); } } while (false)
;
3655 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3656
3657 // Note: if the capture group match was of an empty string the backref
3658 // match succeeds. Verified by testing: Perl matches succeed
3659 // in this case, so we do too.
3660
3661 UBool success = TRUE1;
3662 for (;;) {
3663 if (utext_getNativeIndexutext_getNativeIndex_71(fAltInputText) >= groupEndIdx) {
3664 success = TRUE1;
3665 break;
3666 }
3667 if (utext_getNativeIndexutext_getNativeIndex_71(fInputText) >= fActiveLimit) {
3668 success = FALSE0;
3669 fHitEnd = TRUE1;
3670 break;
3671 }
3672 UChar32 captureGroupChar = utext_next32utext_next32_71(fAltInputText);
3673 UChar32 inputChar = utext_next32utext_next32_71(fInputText);
3674 if (inputChar != captureGroupChar) {
3675 success = FALSE0;
3676 break;
3677 }
3678 }
3679
3680 if (success) {
3681 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3682 } else {
3683 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3684 }
3685 }
3686 break;
3687
3688
3689
3690 case URX_BACKREF_I:
3691 {
3692 U_ASSERT(opValue < fFrameSize)(void)0;
3693 int64_t groupStartIdx = fp->fExtra[opValue];
3694 int64_t groupEndIdx = fp->fExtra[opValue+1];
3695 U_ASSERT(groupStartIdx <= groupEndIdx)(void)0;
3696 if (groupStartIdx < 0) {
3697 // This capture group has not participated in the match thus far,
3698 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3699 break;
3700 }
3701 utext_setNativeIndexutext_setNativeIndex_71(fAltInputText, groupStartIdx);
3702 utext_setNativeIndexutext_setNativeIndex_71(fInputText, fp->fInputIdx);
3703 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3704 CaseFoldingUTextIterator inputItr(*fInputText);
3705
3706 // Note: if the capture group match was of an empty string the backref
3707 // match succeeds. Verified by testing: Perl matches succeed
3708 // in this case, so we do too.
3709
3710 UBool success = TRUE1;
3711 for (;;) {
3712 if (!captureGroupItr.inExpansion() && utext_getNativeIndexutext_getNativeIndex_71(fAltInputText) >= groupEndIdx) {
3713 success = TRUE1;
3714 break;
3715 }
3716 if (!inputItr.inExpansion() && utext_getNativeIndexutext_getNativeIndex_71(fInputText) >= fActiveLimit) {
3717 success = FALSE0;
3718 fHitEnd = TRUE1;
3719 break;
3720 }
3721 UChar32 captureGroupChar = captureGroupItr.next();
3722 UChar32 inputChar = inputItr.next();
3723 if (inputChar != captureGroupChar) {
3724 success = FALSE0;
3725 break;
3726 }
3727 }
3728
3729 if (success && inputItr.inExpansion()) {
3730 // We obtained a match by consuming part of a string obtained from
3731 // case-folding a single code point of the input text.
3732 // This does not count as an overall match.
3733 success = FALSE0;
3734 }
3735
3736 if (success) {
3737 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3738 } else {
3739 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3740 }
3741
3742 }
3743 break;
3744
3745 case URX_STO_INP_LOC:
3746 {
3747 U_ASSERT(opValue >= 0 && opValue < fFrameSize)(void)0;
3748 fp->fExtra[opValue] = fp->fInputIdx;
3749 }
3750 break;
3751
3752 case URX_JMPX:
3753 {
3754 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3755 fp->fPatIdx += 1;
3756 int32_t dataLoc = URX_VAL(pat[instrOperandLoc])((pat[instrOperandLoc]) & 0xffffff);
3757 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize)(void)0;
3758 int64_t savedInputIdx = fp->fExtra[dataLoc];
3759 U_ASSERT(savedInputIdx <= fp->fInputIdx)(void)0;
3760 if (savedInputIdx < fp->fInputIdx) {
3761 fp->fPatIdx = opValue; // JMP
3762 } else {
3763 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3764 }
3765 }
3766 break;
3767
3768 case URX_LA_START:
3769 {
3770 // Entering a look around block.
3771 // Save Stack Ptr, Input Pos.
3772 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize)(void)0;
3773 fData[opValue] = fStack->size();
3774 fData[opValue+1] = fp->fInputIdx;
3775 fData[opValue+2] = fActiveStart;
3776 fData[opValue+3] = fActiveLimit;
3777 fActiveStart = fLookStart; // Set the match region change for
3778 fActiveLimit = fLookLimit; // transparent bounds.
3779 }
3780 break;
3781
3782 case URX_LA_END:
3783 {
3784 // Leaving a look-ahead block.
3785 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3786 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize)(void)0;
3787 int32_t stackSize = fStack->size();
3788 int32_t newStackSize =(int32_t)fData[opValue];
3789 U_ASSERT(stackSize >= newStackSize)(void)0;
3790 if (stackSize > newStackSize) {
3791 // Copy the current top frame back to the new (cut back) top frame.
3792 // This makes the capture groups from within the look-ahead
3793 // expression available.
3794 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3795 int32_t j;
3796 for (j=0; j<fFrameSize; j++) {
3797 newFP[j] = ((int64_t *)fp)[j];
3798 }
3799 fp = (REStackFrame *)newFP;
3800 fStack->setSize(newStackSize);
3801 }
3802 fp->fInputIdx = fData[opValue+1];
3803
3804 // Restore the active region bounds in the input string; they may have
3805 // been changed because of transparent bounds on a Region.
3806 fActiveStart = fData[opValue+2];
3807 fActiveLimit = fData[opValue+3];
3808 U_ASSERT(fActiveStart >= 0)(void)0;
3809 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
3810 }
3811 break;
3812
3813 case URX_ONECHAR_I:
3814 // Case insensitive one char. The char from the pattern is already case folded.
3815 // Input text is not, but case folding the input can not reduce two or more code
3816 // points to one.
3817 if (fp->fInputIdx < fActiveLimit) {
3818 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3819
3820 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
3821 if (u_foldCaseu_foldCase_71(c, U_FOLD_CASE_DEFAULT0) == opValue) {
3822 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3823 break;
3824 }
3825 } else {
3826 fHitEnd = TRUE1;
3827 }
3828
3829 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3830 break;
3831
3832 case URX_STRING_I:
3833 {
3834 // Case-insensitive test input against a literal string.
3835 // Strings require two slots in the compiled pattern, one for the
3836 // offset to the string text, and one for the length.
3837 // The compiled string has already been case folded.
3838 {
3839 const UChar *patternString = litText + opValue;
3840 int32_t patternStringIdx = 0;
3841
3842 op = (int32_t)pat[fp->fPatIdx];
3843 fp->fPatIdx++;
3844 opType = URX_TYPE(op)((uint32_t)(op) >> 24);
3845 opValue = URX_VAL(op)((op) & 0xffffff);
3846 U_ASSERT(opType == URX_STRING_LEN)(void)0;
3847 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3848
3849
3850 UChar32 cPattern;
3851 UChar32 cText;
3852 UBool success = TRUE1;
3853
3854 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
3855 CaseFoldingUTextIterator inputIterator(*fInputText);
3856 while (patternStringIdx < patternStringLen) {
3857 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
>= fActiveLimit) {
3858 success = FALSE0;
3859 fHitEnd = TRUE1;
3860 break;
3861 }
3862 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern)do { (cPattern)=(patternString)[(patternStringIdx)++]; if((((
cPattern)&0xfffffc00)==0xd800)) { uint16_t __c2; if((patternStringIdx
)!=(patternStringLen) && (((__c2=(patternString)[(patternStringIdx
)])&0xfffffc00)==0xdc00)) { ++(patternStringIdx); (cPattern
)=(((UChar32)((cPattern))<<10UL)+(UChar32)(__c2)-((0xd800
<<10UL)+0xdc00-0x10000)); } } } while (false)
;
3863 cText = inputIterator.next();
3864 if (cText != cPattern) {
3865 success = FALSE0;
3866 break;
3867 }
3868 }
3869 if (inputIterator.inExpansion()) {
3870 success = FALSE0;
3871 }
3872
3873 if (success) {
3874 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3875 } else {
3876 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3877 }
3878 }
3879 }
3880 break;
3881
3882 case URX_LB_START:
3883 {
3884 // Entering a look-behind block.
3885 // Save Stack Ptr, Input Pos and active input region.
3886 // TODO: implement transparent bounds. Ticket #6067
3887 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
3888 fData[opValue] = fStack->size();
3889 fData[opValue+1] = fp->fInputIdx;
3890 // Save input string length, then reset to pin any matches to end at
3891 // the current position.
3892 fData[opValue+2] = fActiveStart;
3893 fData[opValue+3] = fActiveLimit;
3894 fActiveStart = fRegionStart;
3895 fActiveLimit = fp->fInputIdx;
3896 // Init the variable containing the start index for attempted matches.
3897 fData[opValue+4] = -1;
3898 }
3899 break;
3900
3901
3902 case URX_LB_CONT:
3903 {
3904 // Positive Look-Behind, at top of loop checking for matches of LB expression
3905 // at all possible input starting positions.
3906
3907 // Fetch the min and max possible match lengths. They are the operands
3908 // of this op in the pattern.
3909 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3910 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3911 if (!UTEXT_USES_U16(fInputText)(__null==((fInputText)->pFuncs->mapNativeIndexToUTF16))) {
3912 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3913 // The max length need not be exact; it just needs to be >= actual maximum.
3914 maxML *= 3;
3915 }
3916 U_ASSERT(minML <= maxML)(void)0;
3917 U_ASSERT(minML >= 0)(void)0;
3918
3919 // Fetch (from data) the last input index where a match was attempted.
3920 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
3921 int64_t &lbStartIdx = fData[opValue+4];
3922 if (lbStartIdx < 0) {
3923 // First time through loop.
3924 lbStartIdx = fp->fInputIdx - minML;
3925 if (lbStartIdx > 0) {
3926 // move index to a code point boundary, if it's not on one already.
3927 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx)do { int64_t __offset = (lbStartIdx) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (lbStartIdx
)); } } while (false)
;
3928 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3929 }
3930 } else {
3931 // 2nd through nth time through the loop.
3932 // Back up start position for match by one.
3933 if (lbStartIdx == 0) {
3934 (lbStartIdx)--;
3935 } else {
3936 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx)do { int64_t __offset = (lbStartIdx) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (lbStartIdx
)); } } while (false)
;
3937 (void)UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
;
3938 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
3939 }
3940 }
3941
3942 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3943 // We have tried all potential match starting points without
3944 // getting a match. Backtrack out, and out of the
3945 // Look Behind altogether.
3946 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3947 fActiveStart = fData[opValue+2];
3948 fActiveLimit = fData[opValue+3];
3949 U_ASSERT(fActiveStart >= 0)(void)0;
3950 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
3951 break;
3952 }
3953
3954 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3955 // (successful match will fall off the end of the loop.)
3956 fp = StateSave(fp, fp->fPatIdx-3, status);
3957 fp->fInputIdx = lbStartIdx;
3958 }
3959 break;
3960
3961 case URX_LB_END:
3962 // End of a look-behind block, after a successful match.
3963 {
3964 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
3965 if (fp->fInputIdx != fActiveLimit) {
3966 // The look-behind expression matched, but the match did not
3967 // extend all the way to the point that we are looking behind from.
3968 // FAIL out of here, which will take us back to the LB_CONT, which
3969 // will retry the match starting at another position or fail
3970 // the look-behind altogether, whichever is appropriate.
3971 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3972 break;
3973 }
3974
3975 // Look-behind match is good. Restore the original input string region,
3976 // which had been truncated to pin the end of the lookbehind match to the
3977 // position being looked-behind.
3978 fActiveStart = fData[opValue+2];
3979 fActiveLimit = fData[opValue+3];
3980 U_ASSERT(fActiveStart >= 0)(void)0;
3981 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
3982 }
3983 break;
3984
3985
3986 case URX_LBN_CONT:
3987 {
3988 // Negative Look-Behind, at top of loop checking for matches of LB expression
3989 // at all possible input starting positions.
3990
3991 // Fetch the extra parameters of this op.
3992 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3993 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3994 if (!UTEXT_USES_U16(fInputText)(__null==((fInputText)->pFuncs->mapNativeIndexToUTF16))) {
3995 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3996 // The max length need not be exact; it just needs to be >= actual maximum.
3997 maxML *= 3;
3998 }
3999 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4000 continueLoc = URX_VAL(continueLoc)((continueLoc) & 0xffffff);
4001 U_ASSERT(minML <= maxML)(void)0;
4002 U_ASSERT(minML >= 0)(void)0;
4003 U_ASSERT(continueLoc > fp->fPatIdx)(void)0;
4004
4005 // Fetch (from data) the last input index where a match was attempted.
4006 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
4007 int64_t &lbStartIdx = fData[opValue+4];
4008 if (lbStartIdx < 0) {
4009 // First time through loop.
4010 lbStartIdx = fp->fInputIdx - minML;
4011 if (lbStartIdx > 0) {
4012 // move index to a code point boundary, if it's not on one already.
4013 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx)do { int64_t __offset = (lbStartIdx) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (lbStartIdx
)); } } while (false)
;
4014 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
4015 }
4016 } else {
4017 // 2nd through nth time through the loop.
4018 // Back up start position for match by one.
4019 if (lbStartIdx == 0) {
4020 (lbStartIdx)--;
4021 } else {
4022 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx)do { int64_t __offset = (lbStartIdx) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (lbStartIdx
)); } } while (false)
;
4023 (void)UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
;
4024 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
4025 }
4026 }
4027
4028 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4029 // We have tried all potential match starting points without
4030 // getting a match, which means that the negative lookbehind as
4031 // a whole has succeeded. Jump forward to the continue location
4032 fActiveStart = fData[opValue+2];
4033 fActiveLimit = fData[opValue+3];
4034 U_ASSERT(fActiveStart >= 0)(void)0;
4035 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
4036 fp->fPatIdx = continueLoc;
4037 break;
4038 }
4039
4040 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4041 // (successful match will cause a FAIL out of the loop altogether.)
4042 fp = StateSave(fp, fp->fPatIdx-4, status);
4043 fp->fInputIdx = lbStartIdx;
4044 }
4045 break;
4046
4047 case URX_LBN_END:
4048 // End of a negative look-behind block, after a successful match.
4049 {
4050 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
4051 if (fp->fInputIdx != fActiveLimit) {
4052 // The look-behind expression matched, but the match did not
4053 // extend all the way to the point that we are looking behind from.
4054 // FAIL out of here, which will take us back to the LB_CONT, which
4055 // will retry the match starting at another position or succeed
4056 // the look-behind altogether, whichever is appropriate.
4057 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4058 break;
4059 }
4060
4061 // Look-behind expression matched, which means look-behind test as
4062 // a whole Fails
4063
4064 // Restore the original input string length, which had been truncated
4065 // inorder to pin the end of the lookbehind match
4066 // to the position being looked-behind.
4067 fActiveStart = fData[opValue+2];
4068 fActiveLimit = fData[opValue+3];
4069 U_ASSERT(fActiveStart >= 0)(void)0;
4070 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
4071
4072 // Restore original stack position, discarding any state saved
4073 // by the successful pattern match.
4074 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize)(void)0;
4075 int32_t newStackSize = (int32_t)fData[opValue];
4076 U_ASSERT(fStack->size() > newStackSize)(void)0;
4077 fStack->setSize(newStackSize);
4078
4079 // FAIL, which will take control back to someplace
4080 // prior to entering the look-behind test.
4081 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4082 }
4083 break;
4084
4085
4086 case URX_LOOP_SR_I:
4087 // Loop Initialization for the optimized implementation of
4088 // [some character set]*
4089 // This op scans through all matching input.
4090 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4091 {
4092 U_ASSERT(opValue > 0 && opValue < fSets->size())(void)0;
4093 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4094 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4095
4096 // Loop through input, until either the input is exhausted or
4097 // we reach a character that is not a member of the set.
4098 int64_t ix = fp->fInputIdx;
4099 UTEXT_SETNATIVEINDEX(fInputText, ix)do { int64_t __offset = (ix) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (ix
)); } } while (false)
;
4100 for (;;) {
4101 if (ix >= fActiveLimit) {
4102 fHitEnd = TRUE1;
4103 break;
4104 }
4105 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
4106 if (c<256) {
4107 if (s8->contains(c) == FALSE0) {
4108 break;
4109 }
4110 } else {
4111 if (s->contains(c) == FALSE0) {
4112 break;
4113 }
4114 }
4115 ix = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
4116 }
4117
4118 // If there were no matching characters, skip over the loop altogether.
4119 // The loop doesn't run at all, a * op always succeeds.
4120 if (ix == fp->fInputIdx) {
4121 fp->fPatIdx++; // skip the URX_LOOP_C op.
4122 break;
4123 }
4124
4125 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4126 // must follow. It's operand is the stack location
4127 // that holds the starting input index for the match of this [set]*
4128 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4129 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C)(void)0;
4130 int32_t stackLoc = URX_VAL(loopcOp)((loopcOp) & 0xffffff);
4131 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize)(void)0;
4132 fp->fExtra[stackLoc] = fp->fInputIdx;
4133 fp->fInputIdx = ix;
4134
4135 // Save State to the URX_LOOP_C op that follows this one,
4136 // so that match failures in the following code will return to there.
4137 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4138 fp = StateSave(fp, fp->fPatIdx, status);
4139 fp->fPatIdx++;
4140 }
4141 break;
4142
4143
4144 case URX_LOOP_DOT_I:
4145 // Loop Initialization for the optimized implementation of .*
4146 // This op scans through all remaining input.
4147 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4148 {
4149 // Loop through input until the input is exhausted (we reach an end-of-line)
4150 // In DOTALL mode, we can just go straight to the end of the input.
4151 int64_t ix;
4152 if ((opValue & 1) == 1) {
4153 // Dot-matches-All mode. Jump straight to the end of the string.
4154 ix = fActiveLimit;
4155 fHitEnd = TRUE1;
4156 } else {
4157 // NOT DOT ALL mode. Line endings do not match '.'
4158 // Scan forward until a line ending or end of input.
4159 ix = fp->fInputIdx;
4160 UTEXT_SETNATIVEINDEX(fInputText, ix)do { int64_t __offset = (ix) - (fInputText)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fInputText
)->nativeIndexingLimit && (fInputText)->chunkContents
[__offset]<0xdc00) { (fInputText)->chunkOffset=(int32_t
)__offset; } else { utext_setNativeIndex_71((fInputText), (ix
)); } } while (false)
;
4161 for (;;) {
4162 if (ix >= fActiveLimit) {
4163 fHitEnd = TRUE1;
4164 break;
4165 }
4166 UChar32 c = UTEXT_NEXT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)++] : utext_next32_71(fInputText))
;
4167 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4168 if ((c == 0x0a) || // 0x0a is newline in both modes.
4169 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4170 isLineTerminator(c))) {
4171 // char is a line ending. Exit the scanning loop.
4172 break;
4173 }
4174 }
4175 ix = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
4176 }
4177 }
4178
4179 // If there were no matching characters, skip over the loop altogether.
4180 // The loop doesn't run at all, a * op always succeeds.
4181 if (ix == fp->fInputIdx) {
4182 fp->fPatIdx++; // skip the URX_LOOP_C op.
4183 break;
4184 }
4185
4186 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4187 // must follow. It's operand is the stack location
4188 // that holds the starting input index for the match of this .*
4189 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4190 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C)(void)0;
4191 int32_t stackLoc = URX_VAL(loopcOp)((loopcOp) & 0xffffff);
4192 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize)(void)0;
4193 fp->fExtra[stackLoc] = fp->fInputIdx;
4194 fp->fInputIdx = ix;
4195
4196 // Save State to the URX_LOOP_C op that follows this one,
4197 // so that match failures in the following code will return to there.
4198 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4199 fp = StateSave(fp, fp->fPatIdx, status);
4200 fp->fPatIdx++;
4201 }
4202 break;
4203
4204
4205 case URX_LOOP_C:
4206 {
4207 U_ASSERT(opValue>=0 && opValue<fFrameSize)(void)0;
4208 backSearchIndex = fp->fExtra[opValue];
4209 U_ASSERT(backSearchIndex <= fp->fInputIdx)(void)0;
4210 if (backSearchIndex == fp->fInputIdx) {
4211 // We've backed up the input idx to the point that the loop started.
4212 // The loop is done. Leave here without saving state.
4213 // Subsequent failures won't come back here.
4214 break;
4215 }
4216 // Set up for the next iteration of the loop, with input index
4217 // backed up by one from the last time through,
4218 // and a state save to this instruction in case the following code fails again.
4219 // (We're going backwards because this loop emulates stack unwinding, not
4220 // the initial scan forward.)
4221 U_ASSERT(fp->fInputIdx > 0)(void)0;
4222 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
4223 UChar32 prevC = UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
;
4224 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
4225
4226 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText)((fInputText)->chunkOffset > 0 && (fInputText)->
chunkContents[(fInputText)->chunkOffset-1] < 0xd800 ? (
fInputText)->chunkContents[--((fInputText)->chunkOffset
)] : utext_previous32_71(fInputText))
;
4227 if (prevC == 0x0a &&
4228 fp->fInputIdx > backSearchIndex &&
4229 twoPrevC == 0x0d) {
4230 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4231 if (URX_TYPE(prevOp)((uint32_t)(prevOp) >> 24) == URX_LOOP_DOT_I) {
4232 // .*, stepping back over CRLF pair.
4233 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)((fInputText)->chunkOffset <= (fInputText)->nativeIndexingLimit
? (fInputText)->chunkNativeStart+(fInputText)->chunkOffset
: (fInputText)->pFuncs->mapOffsetToNative(fInputText))
;
4234 }
4235 }
4236
4237
4238 fp = StateSave(fp, fp->fPatIdx-1, status);
4239 }
4240 break;
4241
4242
4243
4244 default:
4245 // Trouble. The compiled pattern contains an entry with an
4246 // unrecognized type tag.
4247 UPRV_UNREACHABLE_ASSERT(void)0;
4248 // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
4249 // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
4250 // See ICU-21669.
4251 status = U_INTERNAL_PROGRAM_ERROR;
4252 }
4253
4254 if (U_FAILURE(status)) {
4255 isMatch = FALSE0;
4256 break;
4257 }
4258 }
4259
4260breakFromLoop:
4261 fMatch = isMatch;
4262 if (isMatch) {
4263 fLastMatchEnd = fMatchEnd;
4264 fMatchStart = startIdx;
4265 fMatchEnd = fp->fInputIdx;
4266 }
4267
4268#ifdef REGEX_RUN_DEBUG
4269 if (fTraceDebug) {
4270 if (isMatch) {
4271 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4272 } else {
4273 printf("No match\n\n");
4274 }
4275 }
4276#endif
4277
4278 fFrame = fp; // The active stack frame when the engine stopped.
4279 // Contains the capture group results that we need to
4280 // access later.
4281 return;
4282}
4283
4284
4285//--------------------------------------------------------------------------------
4286//
4287// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4288// assumption that the entire string is available in the UText's
4289// chunk buffer. For now, that means we can use int32_t indexes,
4290// except for anything that needs to be saved (like group starts
4291// and ends).
4292//
4293// startIdx: begin matching a this index.
4294// toEnd: if true, match must extend to end of the input region
4295//
4296//--------------------------------------------------------------------------------
4297void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4298 UBool isMatch = FALSE0; // True if the we have a match.
4299
4300 int32_t backSearchIndex = INT32_MAX(2147483647); // used after greedy single-character matches for searching backwards
4301
4302 int32_t op; // Operation from the compiled pattern, split into
4303 int32_t opType; // the opcode
4304 int32_t opValue; // and the operand value.
4305
4306#ifdef REGEX_RUN_DEBUG
4307 if (fTraceDebug) {
4308 printf("MatchAt(startIdx=%d)\n", startIdx);
4309 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4310 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4311 }
4312#endif
4313
4314 if (U_FAILURE(status)) {
1
Taking false branch
4315 return;
4316 }
4317
4318 // Cache frequently referenced items from the compiled pattern
4319 //
4320 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4321
4322 const UChar *litText = fPattern->fLiteralText.getBuffer();
4323 UVector *fSets = fPattern->fSets;
4324
4325 const UChar *inputBuf = fInputText->chunkContents;
4326
4327 fFrameSize = fPattern->fFrameSize;
4328 REStackFrame *fp = resetStack();
4329 if (U_FAILURE(fDeferredStatus)) {
2
Taking false branch
4330 status = fDeferredStatus;
4331 return;
4332 }
4333
4334 fp->fPatIdx = 0;
4335 fp->fInputIdx = startIdx;
4336
4337 // Zero out the pattern's static data
4338 int32_t i;
4339 for (i = 0; i<fPattern->fDataSize; i++) {
3
Assuming 'i' is >= field 'fDataSize'
4
Loop condition is false. Execution continues on line 4347
4340 fData[i] = 0;
4341 }
4342
4343 //
4344 // Main loop for interpreting the compiled pattern.
4345 // One iteration of the loop per pattern operation performed.
4346 //
4347 for (;;) {
5
Loop condition is true. Entering loop body
4348 op = (int32_t)pat[fp->fPatIdx];
4349 opType = URX_TYPE(op)((uint32_t)(op) >> 24);
4350 opValue = URX_VAL(op)((op) & 0xffffff);
4351#ifdef REGEX_RUN_DEBUG
4352 if (fTraceDebug) {
4353 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx)do { int64_t __offset = (fp->fInputIdx) - (fInputText)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fInputText)->nativeIndexingLimit && (fInputText
)->chunkContents[__offset]<0xdc00) { (fInputText)->chunkOffset
=(int32_t)__offset; } else { utext_setNativeIndex_71((fInputText
), (fp->fInputIdx)); } } while (false)
;
4354 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4355 UTEXT_CURRENT32(fInputText)((fInputText)->chunkOffset < (fInputText)->chunkLength
&& ((fInputText)->chunkContents)[(fInputText)->
chunkOffset]<0xd800 ? ((fInputText)->chunkContents)[((fInputText
)->chunkOffset)] : utext_current32_71(fInputText))
, (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4356 fPattern->dumpOp(fp->fPatIdx);
4357 }
4358#endif
4359 fp->fPatIdx++;
4360
4361 switch (opType) {
6
Control jumps to 'case URX_STRING:' at line 4390
4362
4363
4364 case URX_NOP:
4365 break;
4366
4367
4368 case URX_BACKTRACK:
4369 // Force a backtrack. In some circumstances, the pattern compiler
4370 // will notice that the pattern can't possibly match anything, and will
4371 // emit one of these at that point.
4372 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4373 break;
4374
4375
4376 case URX_ONECHAR:
4377 if (fp->fInputIdx < fActiveLimit) {
4378 UChar32 c;
4379 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4380 if (c == opValue) {
4381 break;
4382 }
4383 } else {
4384 fHitEnd = TRUE1;
4385 }
4386 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4387 break;
4388
4389
4390 case URX_STRING:
4391 {
4392 // Test input against a literal string.
4393 // Strings require two slots in the compiled pattern, one for the
4394 // offset to the string text, and one for the length.
4395 int32_t stringStartIdx = opValue;
4396 int32_t stringLen;
4397
4398 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4399 fp->fPatIdx++;
4400 opType = URX_TYPE(op)((uint32_t)(op) >> 24);
4401 stringLen = URX_VAL(op)((op) & 0xffffff);
4402 U_ASSERT(opType == URX_STRING_LEN)(void)0;
4403 U_ASSERT(stringLen >= 2)(void)0;
4404
4405 const UChar * pInp = inputBuf + fp->fInputIdx;
4406 const UChar * pInpLimit = inputBuf + fActiveLimit;
4407 const UChar * pPat = litText+stringStartIdx;
4408 const UChar * pEnd = pInp + stringLen;
4409 UBool success = TRUE1;
4410 while (pInp < pEnd) {
7
Assuming 'pInp' is < 'pEnd'
8
Loop condition is true. Entering loop body
4411 if (pInp >= pInpLimit) {
9
Assuming 'pInp' is < 'pInpLimit'
10
Taking false branch
4412 fHitEnd = TRUE1;
4413 success = FALSE0;
4414 break;
4415 }
4416 if (*pInp++ != *pPat++) {
11
Null pointer value stored to 'pPat'
12
Dereference of null pointer
4417 success = FALSE0;
4418 break;
4419 }
4420 }
4421
4422 if (success) {
4423 fp->fInputIdx += stringLen;
4424 } else {
4425 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4426 }
4427 }
4428 break;
4429
4430
4431 case URX_STATE_SAVE:
4432 fp = StateSave(fp, opValue, status);
4433 break;
4434
4435
4436 case URX_END:
4437 // The match loop will exit via this path on a successful match,
4438 // when we reach the end of the pattern.
4439 if (toEnd && fp->fInputIdx != fActiveLimit) {
4440 // The pattern matched, but not to the end of input. Try some more.
4441 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4442 break;
4443 }
4444 isMatch = TRUE1;
4445 goto breakFromLoop;
4446
4447 // Start and End Capture stack frame variables are laid out out like this:
4448 // fp->fExtra[opValue] - The start of a completed capture group
4449 // opValue+1 - The end of a completed capture group
4450 // opValue+2 - the start of a capture group whose end
4451 // has not yet been reached (and might not ever be).
4452 case URX_START_CAPTURE:
4453 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3)(void)0;
4454 fp->fExtra[opValue+2] = fp->fInputIdx;
4455 break;
4456
4457
4458 case URX_END_CAPTURE:
4459 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3)(void)0;
4460 U_ASSERT(fp->fExtra[opValue+2] >= 0)(void)0; // Start pos for this group must be set.
4461 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4462 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4463 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1])(void)0;
4464 break;
4465
4466
4467 case URX_DOLLAR: // $, test for End of line
4468 // or for position before new line at end of input
4469 if (fp->fInputIdx < fAnchorLimit-2) {
4470 // We are no where near the end of input. Fail.
4471 // This is the common case. Keep it first.
4472 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4473 break;
4474 }
4475 if (fp->fInputIdx >= fAnchorLimit) {
4476 // We really are at the end of input. Success.
4477 fHitEnd = TRUE1;
4478 fRequireEnd = TRUE1;
4479 break;
4480 }
4481
4482 // If we are positioned just before a new-line that is located at the
4483 // end of input, succeed.
4484 if (fp->fInputIdx == fAnchorLimit-1) {
4485 UChar32 c;
4486 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c)do { (c)=(inputBuf)[fp->fInputIdx]; if((((c)&0xfffff800
)==0xd800)) { uint16_t __c2; if((((c)&0x400)==0)) { if((fp
->fInputIdx)+1!=(fAnchorLimit) && (((__c2=(inputBuf
)[(fp->fInputIdx)+1])&0xfffffc00)==0xdc00)) { (c)=(((UChar32
)((c))<<10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00
-0x10000)); } } else { if((fp->fInputIdx)>(fAnchorStart
) && (((__c2=(inputBuf)[(fp->fInputIdx)-1])&0xfffffc00
)==0xd800)) { (c)=(((UChar32)(__c2)<<10UL)+(UChar32)((c
))-((0xd800<<10UL)+0xdc00-0x10000)); } } } } while (false
)
;
4487
4488 if (isLineTerminator(c)) {
4489 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4490 // At new-line at end of input. Success
4491 fHitEnd = TRUE1;
4492 fRequireEnd = TRUE1;
4493 break;
4494 }
4495 }
4496 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4497 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4498 fHitEnd = TRUE1;
4499 fRequireEnd = TRUE1;
4500 break; // At CR/LF at end of input. Success
4501 }
4502
4503 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4504
4505 break;
4506
4507
4508 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4509 if (fp->fInputIdx >= fAnchorLimit-1) {
4510 // Either at the last character of input, or off the end.
4511 if (fp->fInputIdx == fAnchorLimit-1) {
4512 // At last char of input. Success if it's a new line.
4513 if (inputBuf[fp->fInputIdx] == 0x0a) {
4514 fHitEnd = TRUE1;
4515 fRequireEnd = TRUE1;
4516 break;
4517 }
4518 } else {
4519 // Off the end of input. Success.
4520 fHitEnd = TRUE1;
4521 fRequireEnd = TRUE1;
4522 break;
4523 }
4524 }
4525
4526 // Not at end of input. Back-track out.
4527 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4528 break;
4529
4530
4531 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4532 {
4533 if (fp->fInputIdx >= fAnchorLimit) {
4534 // We really are at the end of input. Success.
4535 fHitEnd = TRUE1;
4536 fRequireEnd = TRUE1;
4537 break;
4538 }
4539 // If we are positioned just before a new-line, succeed.
4540 // It makes no difference where the new-line is within the input.
4541 UChar32 c = inputBuf[fp->fInputIdx];
4542 if (isLineTerminator(c)) {
4543 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4544 // In multi-line mode, hitting a new-line just before the end of input does not
4545 // set the hitEnd or requireEnd flags
4546 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4547 break;
4548 }
4549 }
4550 // not at a new line. Fail.
4551 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552 }
4553 break;
4554
4555
4556 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4557 {
4558 if (fp->fInputIdx >= fAnchorLimit) {
4559 // We really are at the end of input. Success.
4560 fHitEnd = TRUE1;
4561 fRequireEnd = TRUE1; // Java set requireEnd in this case, even though
4562 break; // adding a new-line would not lose the match.
4563 }
4564 // If we are not positioned just before a new-line, the test fails; backtrack out.
4565 // It makes no difference where the new-line is within the input.
4566 if (inputBuf[fp->fInputIdx] != 0x0a) {
4567 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4568 }
4569 }
4570 break;
4571
4572
4573 case URX_CARET: // ^, test for start of line
4574 if (fp->fInputIdx != fAnchorStart) {
4575 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4576 }
4577 break;
4578
4579
4580 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4581 {
4582 if (fp->fInputIdx == fAnchorStart) {
4583 // We are at the start input. Success.
4584 break;
4585 }
4586 // Check whether character just before the current pos is a new-line
4587 // unless we are at the end of input
4588 UChar c = inputBuf[fp->fInputIdx - 1];
4589 if ((fp->fInputIdx < fAnchorLimit) &&
4590 isLineTerminator(c)) {
4591 // It's a new-line. ^ is true. Success.
4592 // TODO: what should be done with positions between a CR and LF?
4593 break;
4594 }
4595 // Not at the start of a line. Fail.
4596 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4597 }
4598 break;
4599
4600
4601 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4602 {
4603 U_ASSERT(fp->fInputIdx >= fAnchorStart)(void)0;
4604 if (fp->fInputIdx <= fAnchorStart) {
4605 // We are at the start input. Success.
4606 break;
4607 }
4608 // Check whether character just before the current pos is a new-line
4609 U_ASSERT(fp->fInputIdx <= fAnchorLimit)(void)0;
4610 UChar c = inputBuf[fp->fInputIdx - 1];
4611 if (c != 0x0a) {
4612 // Not at the start of a line. Back-track out.
4613 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4614 }
4615 }
4616 break;
4617
4618 case URX_BACKSLASH_B: // Test for word boundaries
4619 {
4620 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4621 success ^= (UBool)(opValue != 0); // flip sense for \B
4622 if (!success) {
4623 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4624 }
4625 }
4626 break;
4627
4628
4629 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4630 {
4631 UBool success = isUWordBoundary(fp->fInputIdx, status);
4632 success ^= (UBool)(opValue != 0); // flip sense for \B
4633 if (!success) {
4634 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4635 }
4636 }
4637 break;
4638
4639
4640 case URX_BACKSLASH_D: // Test for decimal digit
4641 {
4642 if (fp->fInputIdx >= fActiveLimit) {
4643 fHitEnd = TRUE1;
4644 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4645 break;
4646 }
4647
4648 UChar32 c;
4649 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4650 int8_t ctype = u_charTypeu_charType_71(c); // TODO: make a unicode set for this. Will be faster.
4651 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4652 success ^= (UBool)(opValue != 0); // flip sense for \D
4653 if (!success) {
4654 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4655 }
4656 }
4657 break;
4658
4659
4660 case URX_BACKSLASH_G: // Test for position at end of previous match
4661 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE0 && fp->fInputIdx==fActiveStart))) {
4662 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4663 }
4664 break;
4665
4666
4667 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4668 {
4669 if (fp->fInputIdx >= fActiveLimit) {
4670 fHitEnd = TRUE1;
4671 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4672 break;
4673 }
4674 UChar32 c;
4675 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4676 int8_t ctype = u_charTypeu_charType_71(c);
4677 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4678 success ^= (UBool)(opValue != 0); // flip sense for \H
4679 if (!success) {
4680 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4681 }
4682 }
4683 break;
4684
4685
4686 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4687 {
4688 if (fp->fInputIdx >= fActiveLimit) {
4689 fHitEnd = TRUE1;
4690 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4691 break;
4692 }
4693 UChar32 c;
4694 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4695 if (isLineTerminator(c)) {
4696 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4697 // Check for CR/LF sequence. Consume both together when found.
4698 UChar c2;
4699 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2)do { (c2)=(inputBuf)[(fp->fInputIdx)++]; if((((c2)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c2)=(((UChar32)((c2))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4700 if (c2 != 0x0a) {
4701 U16_PREV(inputBuf, 0, fp->fInputIdx, c2)do { (c2)=(inputBuf)[--(fp->fInputIdx)]; if((((c2)&0xfffffc00
)==0xdc00)) { uint16_t __c2; if((fp->fInputIdx)>(0) &&
(((__c2=(inputBuf)[(fp->fInputIdx)-1])&0xfffffc00)==0xd800
)) { --(fp->fInputIdx); (c2)=(((UChar32)(__c2)<<10UL
)+(UChar32)((c2))-((0xd800<<10UL)+0xdc00-0x10000)); } }
} while (false)
;
4702 }
4703 }
4704 } else {
4705 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4706 }
4707 }
4708 break;
4709
4710
4711 case URX_BACKSLASH_V: // Any single code point line ending.
4712 {
4713 if (fp->fInputIdx >= fActiveLimit) {
4714 fHitEnd = TRUE1;
4715 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716 break;
4717 }
4718 UChar32 c;
4719 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4720 UBool success = isLineTerminator(c);
4721 success ^= (UBool)(opValue != 0); // flip sense for \V
4722 if (!success) {
4723 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4724 }
4725 }
4726 break;
4727
4728
4729 case URX_BACKSLASH_X:
4730 // Match a Grapheme, as defined by Unicode UAX 29.
4731
4732 // Fail if at end of input
4733 if (fp->fInputIdx >= fActiveLimit) {
4734 fHitEnd = TRUE1;
4735 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4736 break;
4737 }
4738
4739 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
4740 if (fp->fInputIdx >= fActiveLimit) {
4741 fHitEnd = TRUE1;
4742 fp->fInputIdx = fActiveLimit;
4743 }
4744 break;
4745
4746
4747 case URX_BACKSLASH_Z: // Test for end of Input
4748 if (fp->fInputIdx < fAnchorLimit) {
4749 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4750 } else {
4751 fHitEnd = TRUE1;
4752 fRequireEnd = TRUE1;
4753 }
4754 break;
4755
4756
4757
4758 case URX_STATIC_SETREF:
4759 {
4760 // Test input character against one of the predefined sets
4761 // (Word Characters, for example)
4762 // The high bit of the op value is a flag for the match polarity.
4763 // 0: success if input char is in set.
4764 // 1: success if input char is not in set.
4765 if (fp->fInputIdx >= fActiveLimit) {
4766 fHitEnd = TRUE1;
4767 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4768 break;
4769 }
4770
4771 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4772 opValue &= ~URX_NEG_SET;
4773 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET)(void)0;
4774
4775 UChar32 c;
4776 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4777 if (c < 256) {
4778 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4779 if (s8.contains(c)) {
4780 success = !success;
4781 }
4782 } else {
4783 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4784 if (s.contains(c)) {
4785 success = !success;
4786 }
4787 }
4788 if (!success) {
4789 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4790 }
4791 }
4792 break;
4793
4794
4795 case URX_STAT_SETREF_N:
4796 {
4797 // Test input character for NOT being a member of one of
4798 // the predefined sets (Word Characters, for example)
4799 if (fp->fInputIdx >= fActiveLimit) {
4800 fHitEnd = TRUE1;
4801 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4802 break;
4803 }
4804
4805 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET)(void)0;
4806
4807 UChar32 c;
4808 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4809 if (c < 256) {
4810 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4811 if (s8.contains(c) == FALSE0) {
4812 break;
4813 }
4814 } else {
4815 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4816 if (s.contains(c) == FALSE0) {
4817 break;
4818 }
4819 }
4820 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4821 }
4822 break;
4823
4824
4825 case URX_SETREF:
4826 {
4827 if (fp->fInputIdx >= fActiveLimit) {
4828 fHitEnd = TRUE1;
4829 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4830 break;
4831 }
4832
4833 U_ASSERT(opValue > 0 && opValue < fSets->size())(void)0;
4834
4835 // There is input left. Pick up one char and test it for set membership.
4836 UChar32 c;
4837 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4838 if (c<256) {
4839 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4840 if (s8->contains(c)) {
4841 // The character is in the set. A Match.
4842 break;
4843 }
4844 } else {
4845 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4846 if (s->contains(c)) {
4847 // The character is in the set. A Match.
4848 break;
4849 }
4850 }
4851
4852 // the character wasn't in the set.
4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4854 }
4855 break;
4856
4857
4858 case URX_DOTANY:
4859 {
4860 // . matches anything, but stops at end-of-line.
4861 if (fp->fInputIdx >= fActiveLimit) {
4862 // At end of input. Match failed. Backtrack out.
4863 fHitEnd = TRUE1;
4864 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4865 break;
4866 }
4867
4868 // There is input left. Advance over one char, unless we've hit end-of-line
4869 UChar32 c;
4870 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4871 if (isLineTerminator(c)) {
4872 // End of line in normal mode. . does not match.
4873 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4874 break;
4875 }
4876 }
4877 break;
4878
4879
4880 case URX_DOTANY_ALL:
4881 {
4882 // . in dot-matches-all (including new lines) mode
4883 if (fp->fInputIdx >= fActiveLimit) {
4884 // At end of input. Match failed. Backtrack out.
4885 fHitEnd = TRUE1;
4886 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4887 break;
4888 }
4889
4890 // There is input left. Advance over one char, except if we are
4891 // at a cr/lf, advance over both of them.
4892 UChar32 c;
4893 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4894 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4895 // In the case of a CR/LF, we need to advance over both.
4896 if (inputBuf[fp->fInputIdx] == 0x0a) {
4897 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit)do { if(((((inputBuf)[(fp->fInputIdx)++])&0xfffffc00)==
0xd800) && (fp->fInputIdx)!=(fActiveLimit) &&
((((inputBuf)[fp->fInputIdx])&0xfffffc00)==0xdc00)) {
++(fp->fInputIdx); } } while (false)
;
4898 }
4899 }
4900 }
4901 break;
4902
4903
4904 case URX_DOTANY_UNIX:
4905 {
4906 // '.' operator, matches all, but stops at end-of-line.
4907 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4908 if (fp->fInputIdx >= fActiveLimit) {
4909 // At end of input. Match failed. Backtrack out.
4910 fHitEnd = TRUE1;
4911 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4912 break;
4913 }
4914
4915 // There is input left. Advance over one char, unless we've hit end-of-line
4916 UChar32 c;
4917 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
4918 if (c == 0x0a) {
4919 // End of line in normal mode. '.' does not match the \n
4920 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4921 }
4922 }
4923 break;
4924
4925
4926 case URX_JMP:
4927 fp->fPatIdx = opValue;
4928 break;
4929
4930 case URX_FAIL:
4931 isMatch = FALSE0;
4932 goto breakFromLoop;
4933
4934 case URX_JMP_SAV:
4935 U_ASSERT(opValue < fPattern->fCompiledPat->size())(void)0;
4936 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4937 fp->fPatIdx = opValue; // Then JMP.
4938 break;
4939
4940 case URX_JMP_SAV_X:
4941 // This opcode is used with (x)+, when x can match a zero length string.
4942 // Same as JMP_SAV, except conditional on the match having made forward progress.
4943 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4944 // data address of the input position at the start of the loop.
4945 {
4946 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size())(void)0;
4947 int32_t stoOp = (int32_t)pat[opValue-1];
4948 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC)(void)0;
4949 int32_t frameLoc = URX_VAL(stoOp)((stoOp) & 0xffffff);
4950 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize)(void)0;
4951 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4952 U_ASSERT(prevInputIdx <= fp->fInputIdx)(void)0;
4953 if (prevInputIdx < fp->fInputIdx) {
4954 // The match did make progress. Repeat the loop.
4955 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4956 fp->fPatIdx = opValue;
4957 fp->fExtra[frameLoc] = fp->fInputIdx;
4958 }
4959 // If the input position did not advance, we do nothing here,
4960 // execution will fall out of the loop.
4961 }
4962 break;
4963
4964 case URX_CTR_INIT:
4965 {
4966 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2)(void)0;
4967 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4968
4969 // Pick up the three extra operands that CTR_INIT has, and
4970 // skip the pattern location counter past
4971 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4972 fp->fPatIdx += 3;
4973 int32_t loopLoc = URX_VAL(pat[instrOperandLoc])((pat[instrOperandLoc]) & 0xffffff);
4974 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4975 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4976 U_ASSERT(minCount>=0)(void)0;
4977 U_ASSERT(maxCount>=minCount || maxCount==-1)(void)0;
4978 U_ASSERT(loopLoc>=fp->fPatIdx)(void)0;
4979
4980 if (minCount == 0) {
4981 fp = StateSave(fp, loopLoc+1, status);
4982 }
4983 if (maxCount == -1) {
4984 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
4985 } else if (maxCount == 0) {
4986 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4987 }
4988 }
4989 break;
4990
4991 case URX_CTR_LOOP:
4992 {
4993 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2)(void)0;
4994 int32_t initOp = (int32_t)pat[opValue];
4995 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT)(void)0;
4996 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)((initOp) & 0xffffff)];
4997 int32_t minCount = (int32_t)pat[opValue+2];
4998 int32_t maxCount = (int32_t)pat[opValue+3];
4999 (*pCounter)++;
5000 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5001 U_ASSERT(*pCounter == maxCount)(void)0;
5002 break;
5003 }
5004 if (*pCounter >= minCount) {
5005 if (maxCount == -1) {
5006 // Loop has no hard upper bound.
5007 // Check that it is progressing through the input, break if it is not.
5008 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp)((initOp) & 0xffffff) + 1];
5009 if (fp->fInputIdx == *pLastInputIdx) {
5010 break;
5011 } else {
5012 *pLastInputIdx = fp->fInputIdx;
5013 }
5014 }
5015 fp = StateSave(fp, fp->fPatIdx, status);
5016 } else {
5017 // Increment time-out counter. (StateSave() does it if count >= minCount)
5018 fTickCounter--;
5019 if (fTickCounter <= 0) {
5020 IncrementTime(status); // Re-initializes fTickCounter
5021 }
5022 }
5023 fp->fPatIdx = opValue + 4; // Loop back.
5024 }
5025 break;
5026
5027 case URX_CTR_INIT_NG:
5028 {
5029 // Initialize a non-greedy loop
5030 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2)(void)0;
5031 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5032
5033 // Pick up the three extra operands that CTR_INIT_NG has, and
5034 // skip the pattern location counter past
5035 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5036 fp->fPatIdx += 3;
5037 int32_t loopLoc = URX_VAL(pat[instrOperandLoc])((pat[instrOperandLoc]) & 0xffffff);
5038 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5039 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5040 U_ASSERT(minCount>=0)(void)0;
5041 U_ASSERT(maxCount>=minCount || maxCount==-1)(void)0;
5042 U_ASSERT(loopLoc>fp->fPatIdx)(void)0;
5043 if (maxCount == -1) {
5044 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5045 }
5046
5047 if (minCount == 0) {
5048 if (maxCount != 0) {
5049 fp = StateSave(fp, fp->fPatIdx, status);
5050 }
5051 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5052 }
5053 }
5054 break;
5055
5056 case URX_CTR_LOOP_NG:
5057 {
5058 // Non-greedy {min, max} loops
5059 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2)(void)0;
5060 int32_t initOp = (int32_t)pat[opValue];
5061 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG)(void)0;
5062 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)((initOp) & 0xffffff)];
5063 int32_t minCount = (int32_t)pat[opValue+2];
5064 int32_t maxCount = (int32_t)pat[opValue+3];
5065
5066 (*pCounter)++;
5067 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5068 // The loop has matched the maximum permitted number of times.
5069 // Break out of here with no action. Matching will
5070 // continue with the following pattern.
5071 U_ASSERT(*pCounter == maxCount)(void)0;
5072 break;
5073 }
5074
5075 if (*pCounter < minCount) {
5076 // We haven't met the minimum number of matches yet.
5077 // Loop back for another one.
5078 fp->fPatIdx = opValue + 4; // Loop back.
5079 fTickCounter--;
5080 if (fTickCounter <= 0) {
5081 IncrementTime(status); // Re-initializes fTickCounter
5082 }
5083 } else {
5084 // We do have the minimum number of matches.
5085
5086 // If there is no upper bound on the loop iterations, check that the input index
5087 // is progressing, and stop the loop if it is not.
5088 if (maxCount == -1) {
5089 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp)((initOp) & 0xffffff) + 1];
5090 if (fp->fInputIdx == *pLastInputIdx) {
5091 break;
5092 }
5093 *pLastInputIdx = fp->fInputIdx;
5094 }
5095
5096 // Loop Continuation: we will fall into the pattern following the loop
5097 // (non-greedy, don't execute loop body first), but first do
5098 // a state save to the top of the loop, so that a match failure
5099 // in the following pattern will try another iteration of the loop.
5100 fp = StateSave(fp, opValue + 4, status);
5101 }
5102 }
5103 break;
5104
5105 case URX_STO_SP:
5106 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize)(void)0;
5107 fData[opValue] = fStack->size();
5108 break;
5109
5110 case URX_LD_SP:
5111 {
5112 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize)(void)0;
5113 int32_t newStackSize = (int32_t)fData[opValue];
5114 U_ASSERT(newStackSize <= fStack->size())(void)0;
5115 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5116 if (newFP == (int64_t *)fp) {
5117 break;
5118 }
5119 int32_t j;
5120 for (j=0; j<fFrameSize; j++) {
5121 newFP[j] = ((int64_t *)fp)[j];
5122 }
5123 fp = (REStackFrame *)newFP;
5124 fStack->setSize(newStackSize);
5125 }
5126 break;
5127
5128 case URX_BACKREF:
5129 {
5130 U_ASSERT(opValue < fFrameSize)(void)0;
5131 int64_t groupStartIdx = fp->fExtra[opValue];
5132 int64_t groupEndIdx = fp->fExtra[opValue+1];
5133 U_ASSERT(groupStartIdx <= groupEndIdx)(void)0;
5134 int64_t inputIndex = fp->fInputIdx;
5135 if (groupStartIdx < 0) {
5136 // This capture group has not participated in the match thus far,
5137 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5138 break;
5139 }
5140 UBool success = TRUE1;
5141 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5142 if (inputIndex >= fActiveLimit) {
5143 success = FALSE0;
5144 fHitEnd = TRUE1;
5145 break;
5146 }
5147 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5148 success = FALSE0;
5149 break;
5150 }
5151 }
5152 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1])(((inputBuf[groupEndIdx-1])&0xfffffc00)==0xd800) &&
5153 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])(((inputBuf[inputIndex])&0xfffffc00)==0xdc00)) {
5154 // Capture group ended with an unpaired lead surrogate.
5155 // Back reference is not permitted to match lead only of a surrogatge pair.
5156 success = FALSE0;
5157 }
5158 if (success) {
5159 fp->fInputIdx = inputIndex;
5160 } else {
5161 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5162 }
5163 }
5164 break;
5165
5166 case URX_BACKREF_I:
5167 {
5168 U_ASSERT(opValue < fFrameSize)(void)0;
5169 int64_t groupStartIdx = fp->fExtra[opValue];
5170 int64_t groupEndIdx = fp->fExtra[opValue+1];
5171 U_ASSERT(groupStartIdx <= groupEndIdx)(void)0;
5172 if (groupStartIdx < 0) {
5173 // This capture group has not participated in the match thus far,
5174 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5175 break;
5176 }
5177 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5178 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5179
5180 // Note: if the capture group match was of an empty string the backref
5181 // match succeeds. Verified by testing: Perl matches succeed
5182 // in this case, so we do too.
5183
5184 UBool success = TRUE1;
5185 for (;;) {
5186 UChar32 captureGroupChar = captureGroupItr.next();
5187 if (captureGroupChar == U_SENTINEL(-1)) {
5188 success = TRUE1;
5189 break;
5190 }
5191 UChar32 inputChar = inputItr.next();
5192 if (inputChar == U_SENTINEL(-1)) {
5193 success = FALSE0;
5194 fHitEnd = TRUE1;
5195 break;
5196 }
5197 if (inputChar != captureGroupChar) {
5198 success = FALSE0;
5199 break;
5200 }
5201 }
5202
5203 if (success && inputItr.inExpansion()) {
5204 // We obtained a match by consuming part of a string obtained from
5205 // case-folding a single code point of the input text.
5206 // This does not count as an overall match.
5207 success = FALSE0;
5208 }
5209
5210 if (success) {
5211 fp->fInputIdx = inputItr.getIndex();
5212 } else {
5213 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5214 }
5215 }
5216 break;
5217
5218 case URX_STO_INP_LOC:
5219 {
5220 U_ASSERT(opValue >= 0 && opValue < fFrameSize)(void)0;
5221 fp->fExtra[opValue] = fp->fInputIdx;
5222 }
5223 break;
5224
5225 case URX_JMPX:
5226 {
5227 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5228 fp->fPatIdx += 1;
5229 int32_t dataLoc = URX_VAL(pat[instrOperandLoc])((pat[instrOperandLoc]) & 0xffffff);
5230 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize)(void)0;
5231 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5232 U_ASSERT(savedInputIdx <= fp->fInputIdx)(void)0;
5233 if (savedInputIdx < fp->fInputIdx) {
5234 fp->fPatIdx = opValue; // JMP
5235 } else {
5236 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5237 }
5238 }
5239 break;
5240
5241 case URX_LA_START:
5242 {
5243 // Entering a look around block.
5244 // Save Stack Ptr, Input Pos.
5245 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize)(void)0;
5246 fData[opValue] = fStack->size();
5247 fData[opValue+1] = fp->fInputIdx;
5248 fData[opValue+2] = fActiveStart;
5249 fData[opValue+3] = fActiveLimit;
5250 fActiveStart = fLookStart; // Set the match region change for
5251 fActiveLimit = fLookLimit; // transparent bounds.
5252 }
5253 break;
5254
5255 case URX_LA_END:
5256 {
5257 // Leaving a look around block.
5258 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5259 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize)(void)0;
5260 int32_t stackSize = fStack->size();
5261 int32_t newStackSize = (int32_t)fData[opValue];
5262 U_ASSERT(stackSize >= newStackSize)(void)0;
5263 if (stackSize > newStackSize) {
5264 // Copy the current top frame back to the new (cut back) top frame.
5265 // This makes the capture groups from within the look-ahead
5266 // expression available.
5267 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5268 int32_t j;
5269 for (j=0; j<fFrameSize; j++) {
5270 newFP[j] = ((int64_t *)fp)[j];
5271 }
5272 fp = (REStackFrame *)newFP;
5273 fStack->setSize(newStackSize);
5274 }
5275 fp->fInputIdx = fData[opValue+1];
5276
5277 // Restore the active region bounds in the input string; they may have
5278 // been changed because of transparent bounds on a Region.
5279 fActiveStart = fData[opValue+2];
5280 fActiveLimit = fData[opValue+3];
5281 U_ASSERT(fActiveStart >= 0)(void)0;
5282 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
5283 }
5284 break;
5285
5286 case URX_ONECHAR_I:
5287 if (fp->fInputIdx < fActiveLimit) {
5288 UChar32 c;
5289 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c)do { (c)=(inputBuf)[(fp->fInputIdx)++]; if((((c)&0xfffffc00
)==0xd800)) { uint16_t __c2; if((fp->fInputIdx)!=(fActiveLimit
) && (((__c2=(inputBuf)[(fp->fInputIdx)])&0xfffffc00
)==0xdc00)) { ++(fp->fInputIdx); (c)=(((UChar32)((c))<<
10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000));
} } } while (false)
;
5290 if (u_foldCaseu_foldCase_71(c, U_FOLD_CASE_DEFAULT0) == opValue) {
5291 break;
5292 }
5293 } else {
5294 fHitEnd = TRUE1;
5295 }
5296 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5297 break;
5298
5299 case URX_STRING_I:
5300 // Case-insensitive test input against a literal string.
5301 // Strings require two slots in the compiled pattern, one for the
5302 // offset to the string text, and one for the length.
5303 // The compiled string has already been case folded.
5304 {
5305 const UChar *patternString = litText + opValue;
5306
5307 op = (int32_t)pat[fp->fPatIdx];
5308 fp->fPatIdx++;
5309 opType = URX_TYPE(op)((uint32_t)(op) >> 24);
5310 opValue = URX_VAL(op)((op) & 0xffffff);
5311 U_ASSERT(opType == URX_STRING_LEN)(void)0;
5312 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5313
5314 UChar32 cText;
5315 UChar32 cPattern;
5316 UBool success = TRUE1;
5317 int32_t patternStringIdx = 0;
5318 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5319 while (patternStringIdx < patternStringLen) {
5320 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern)do { (cPattern)=(patternString)[(patternStringIdx)++]; if((((
cPattern)&0xfffffc00)==0xd800)) { uint16_t __c2; if((patternStringIdx
)!=(patternStringLen) && (((__c2=(patternString)[(patternStringIdx
)])&0xfffffc00)==0xdc00)) { ++(patternStringIdx); (cPattern
)=(((UChar32)((cPattern))<<10UL)+(UChar32)(__c2)-((0xd800
<<10UL)+0xdc00-0x10000)); } } } while (false)
;
5321 cText = inputIterator.next();
5322 if (cText != cPattern) {
5323 success = FALSE0;
5324 if (cText == U_SENTINEL(-1)) {
5325 fHitEnd = TRUE1;
5326 }
5327 break;
5328 }
5329 }
5330 if (inputIterator.inExpansion()) {
5331 success = FALSE0;
5332 }
5333
5334 if (success) {
5335 fp->fInputIdx = inputIterator.getIndex();
5336 } else {
5337 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5338 }
5339 }
5340 break;
5341
5342 case URX_LB_START:
5343 {
5344 // Entering a look-behind block.
5345 // Save Stack Ptr, Input Pos and active input region.
5346 // TODO: implement transparent bounds. Ticket #6067
5347 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
5348 fData[opValue] = fStack->size();
5349 fData[opValue+1] = fp->fInputIdx;
5350 // Save input string length, then reset to pin any matches to end at
5351 // the current position.
5352 fData[opValue+2] = fActiveStart;
5353 fData[opValue+3] = fActiveLimit;
5354 fActiveStart = fRegionStart;
5355 fActiveLimit = fp->fInputIdx;
5356 // Init the variable containing the start index for attempted matches.
5357 fData[opValue+4] = -1;
5358 }
5359 break;
5360
5361
5362 case URX_LB_CONT:
5363 {
5364 // Positive Look-Behind, at top of loop checking for matches of LB expression
5365 // at all possible input starting positions.
5366
5367 // Fetch the min and max possible match lengths. They are the operands
5368 // of this op in the pattern.
5369 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5370 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5371 U_ASSERT(minML <= maxML)(void)0;
5372 U_ASSERT(minML >= 0)(void)0;
5373
5374 // Fetch (from data) the last input index where a match was attempted.
5375 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
5376 int64_t &lbStartIdx = fData[opValue+4];
5377 if (lbStartIdx < 0) {
5378 // First time through loop.
5379 lbStartIdx = fp->fInputIdx - minML;
5380 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5381 U16_SET_CP_START(inputBuf, 0, lbStartIdx)do { if(((((inputBuf)[lbStartIdx])&0xfffffc00)==0xdc00) &&
(lbStartIdx)>(0) && ((((inputBuf)[(lbStartIdx)-1]
)&0xfffffc00)==0xd800)) { --(lbStartIdx); } } while (false
)
;
5382 }
5383 } else {
5384 // 2nd through nth time through the loop.
5385 // Back up start position for match by one.
5386 if (lbStartIdx == 0) {
5387 lbStartIdx--;
5388 } else {
5389 U16_BACK_1(inputBuf, 0, lbStartIdx)do { if(((((inputBuf)[--(lbStartIdx)])&0xfffffc00)==0xdc00
) && (lbStartIdx)>(0) && ((((inputBuf)[(lbStartIdx
)-1])&0xfffffc00)==0xd800)) { --(lbStartIdx); } } while (
false)
;
5390 }
5391 }
5392
5393 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5394 // We have tried all potential match starting points without
5395 // getting a match. Backtrack out, and out of the
5396 // Look Behind altogether.
5397 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5398 fActiveStart = fData[opValue+2];
5399 fActiveLimit = fData[opValue+3];
5400 U_ASSERT(fActiveStart >= 0)(void)0;
5401 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
5402 break;
5403 }
5404
5405 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5406 // (successful match will fall off the end of the loop.)
5407 fp = StateSave(fp, fp->fPatIdx-3, status);
5408 fp->fInputIdx = lbStartIdx;
5409 }
5410 break;
5411
5412 case URX_LB_END:
5413 // End of a look-behind block, after a successful match.
5414 {
5415 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
5416 if (fp->fInputIdx != fActiveLimit) {
5417 // The look-behind expression matched, but the match did not
5418 // extend all the way to the point that we are looking behind from.
5419 // FAIL out of here, which will take us back to the LB_CONT, which
5420 // will retry the match starting at another position or fail
5421 // the look-behind altogether, whichever is appropriate.
5422 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5423 break;
5424 }
5425
5426 // Look-behind match is good. Restore the original input string region,
5427 // which had been truncated to pin the end of the lookbehind match to the
5428 // position being looked-behind.
5429 fActiveStart = fData[opValue+2];
5430 fActiveLimit = fData[opValue+3];
5431 U_ASSERT(fActiveStart >= 0)(void)0;
5432 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
5433 }
5434 break;
5435
5436
5437 case URX_LBN_CONT:
5438 {
5439 // Negative Look-Behind, at top of loop checking for matches of LB expression
5440 // at all possible input starting positions.
5441
5442 // Fetch the extra parameters of this op.
5443 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5444 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5445 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5446 continueLoc = URX_VAL(continueLoc)((continueLoc) & 0xffffff);
5447 U_ASSERT(minML <= maxML)(void)0;
5448 U_ASSERT(minML >= 0)(void)0;
5449 U_ASSERT(continueLoc > fp->fPatIdx)(void)0;
5450
5451 // Fetch (from data) the last input index where a match was attempted.
5452 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
5453 int64_t &lbStartIdx = fData[opValue+4];
5454 if (lbStartIdx < 0) {
5455 // First time through loop.
5456 lbStartIdx = fp->fInputIdx - minML;
5457 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5458 U16_SET_CP_START(inputBuf, 0, lbStartIdx)do { if(((((inputBuf)[lbStartIdx])&0xfffffc00)==0xdc00) &&
(lbStartIdx)>(0) && ((((inputBuf)[(lbStartIdx)-1]
)&0xfffffc00)==0xd800)) { --(lbStartIdx); } } while (false
)
;
5459 }
5460 } else {
5461 // 2nd through nth time through the loop.
5462 // Back up start position for match by one.
5463 if (lbStartIdx == 0) {
5464 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
5465 } else {
5466 U16_BACK_1(inputBuf, 0, lbStartIdx)do { if(((((inputBuf)[--(lbStartIdx)])&0xfffffc00)==0xdc00
) && (lbStartIdx)>(0) && ((((inputBuf)[(lbStartIdx
)-1])&0xfffffc00)==0xd800)) { --(lbStartIdx); } } while (
false)
;
5467 }
5468 }
5469
5470 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5471 // We have tried all potential match starting points without
5472 // getting a match, which means that the negative lookbehind as
5473 // a whole has succeeded. Jump forward to the continue location
5474 fActiveStart = fData[opValue+2];
5475 fActiveLimit = fData[opValue+3];
5476 U_ASSERT(fActiveStart >= 0)(void)0;
5477 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
5478 fp->fPatIdx = continueLoc;
5479 break;
5480 }
5481
5482 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5483 // (successful match will cause a FAIL out of the loop altogether.)
5484 fp = StateSave(fp, fp->fPatIdx-4, status);
5485 fp->fInputIdx = lbStartIdx;
5486 }
5487 break;
5488
5489 case URX_LBN_END:
5490 // End of a negative look-behind block, after a successful match.
5491 {
5492 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize)(void)0;
5493 if (fp->fInputIdx != fActiveLimit) {
5494 // The look-behind expression matched, but the match did not
5495 // extend all the way to the point that we are looking behind from.
5496 // FAIL out of here, which will take us back to the LB_CONT, which
5497 // will retry the match starting at another position or succeed
5498 // the look-behind altogether, whichever is appropriate.
5499 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5500 break;
5501 }
5502
5503 // Look-behind expression matched, which means look-behind test as
5504 // a whole Fails
5505
5506 // Restore the original input string length, which had been truncated
5507 // inorder to pin the end of the lookbehind match
5508 // to the position being looked-behind.
5509 fActiveStart = fData[opValue+2];
5510 fActiveLimit = fData[opValue+3];
5511 U_ASSERT(fActiveStart >= 0)(void)0;
5512 U_ASSERT(fActiveLimit <= fInputLength)(void)0;
5513
5514 // Restore original stack position, discarding any state saved
5515 // by the successful pattern match.
5516 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize)(void)0;
5517 int32_t newStackSize = (int32_t)fData[opValue];
5518 U_ASSERT(fStack->size() > newStackSize)(void)0;
5519 fStack->setSize(newStackSize);
5520
5521 // FAIL, which will take control back to someplace
5522 // prior to entering the look-behind test.
5523 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5524 }
5525 break;
5526
5527
5528 case URX_LOOP_SR_I:
5529 // Loop Initialization for the optimized implementation of
5530 // [some character set]*
5531 // This op scans through all matching input.
5532 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5533 {
5534 U_ASSERT(opValue > 0 && opValue < fSets->size())(void)0;
5535 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5536 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
5537
5538 // Loop through input, until either the input is exhausted or
5539 // we reach a character that is not a member of the set.
5540 int32_t ix = (int32_t)fp->fInputIdx;
5541 for (;;) {
5542 if (ix >= fActiveLimit) {
5543 fHitEnd = TRUE1;
5544 break;
5545 }
5546 UChar32 c;
5547 U16_NEXT(inputBuf, ix, fActiveLimit, c)do { (c)=(inputBuf)[(ix)++]; if((((c)&0xfffffc00)==0xd800
)) { uint16_t __c2; if((ix)!=(fActiveLimit) && (((__c2
=(inputBuf)[(ix)])&0xfffffc00)==0xdc00)) { ++(ix); (c)=((
(UChar32)((c))<<10UL)+(UChar32)(__c2)-((0xd800<<10UL
)+0xdc00-0x10000)); } } } while (false)
;
5548 if (c<256) {
5549 if (s8->contains(c) == FALSE0) {
5550 U16_BACK_1(inputBuf, 0, ix)do { if(((((inputBuf)[--(ix)])&0xfffffc00)==0xdc00) &&
(ix)>(0) && ((((inputBuf)[(ix)-1])&0xfffffc00
)==0xd800)) { --(ix); } } while (false)
;
5551 break;
5552 }
5553 } else {
5554 if (s->contains(c) == FALSE0) {
5555 U16_BACK_1(inputBuf, 0, ix)do { if(((((inputBuf)[--(ix)])&0xfffffc00)==0xdc00) &&
(ix)>(0) && ((((inputBuf)[(ix)-1])&0xfffffc00
)==0xd800)) { --(ix); } } while (false)
;
5556 break;
5557 }
5558 }
5559 }
5560
5561 // If there were no matching characters, skip over the loop altogether.
5562 // The loop doesn't run at all, a * op always succeeds.
5563 if (ix == fp->fInputIdx) {
5564 fp->fPatIdx++; // skip the URX_LOOP_C op.
5565 break;
5566 }
5567
5568 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5569 // must follow. It's operand is the stack location
5570 // that holds the starting input index for the match of this [set]*
5571 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5572 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C)(void)0;
5573 int32_t stackLoc = URX_VAL(loopcOp)((loopcOp) & 0xffffff);
5574 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize)(void)0;
5575 fp->fExtra[stackLoc] = fp->fInputIdx;
5576 fp->fInputIdx = ix;
5577
5578 // Save State to the URX_LOOP_C op that follows this one,
5579 // so that match failures in the following code will return to there.
5580 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5581 fp = StateSave(fp, fp->fPatIdx, status);
5582 fp->fPatIdx++;
5583 }
5584 break;
5585
5586
5587 case URX_LOOP_DOT_I:
5588 // Loop Initialization for the optimized implementation of .*
5589 // This op scans through all remaining input.
5590 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5591 {
5592 // Loop through input until the input is exhausted (we reach an end-of-line)
5593 // In DOTALL mode, we can just go straight to the end of the input.
5594 int32_t ix;
5595 if ((opValue & 1) == 1) {
5596 // Dot-matches-All mode. Jump straight to the end of the string.
5597 ix = (int32_t)fActiveLimit;
5598 fHitEnd = TRUE1;
5599 } else {
5600 // NOT DOT ALL mode. Line endings do not match '.'
5601 // Scan forward until a line ending or end of input.
5602 ix = (int32_t)fp->fInputIdx;
5603 for (;;) {
5604 if (ix >= fActiveLimit) {
5605 fHitEnd = TRUE1;
5606 break;
5607 }
5608 UChar32 c;
5609 U16_NEXT(inputBuf, ix, fActiveLimit, c)do { (c)=(inputBuf)[(ix)++]; if((((c)&0xfffffc00)==0xd800
)) { uint16_t __c2; if((ix)!=(fActiveLimit) && (((__c2
=(inputBuf)[(ix)])&0xfffffc00)==0xdc00)) { ++(ix); (c)=((
(UChar32)((c))<<10UL)+(UChar32)(__c2)-((0xd800<<10UL
)+0xdc00-0x10000)); } } } while (false)
; // c = inputBuf[ix++]
5610 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5611 if ((c == 0x0a) || // 0x0a is newline in both modes.
5612 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5613 isLineTerminator(c))) {
5614 // char is a line ending. Put the input pos back to the
5615 // line ending char, and exit the scanning loop.
5616 U16_BACK_1(inputBuf, 0, ix)do { if(((((inputBuf)[--(ix)])&0xfffffc00)==0xdc00) &&
(ix)>(0) && ((((inputBuf)[(ix)-1])&0xfffffc00
)==0xd800)) { --(ix); } } while (false)
;
5617 break;
5618 }
5619 }
5620 }
5621 }
5622
5623 // If there were no matching characters, skip over the loop altogether.
5624 // The loop doesn't run at all, a * op always succeeds.
5625 if (ix == fp->fInputIdx) {
5626 fp->fPatIdx++; // skip the URX_LOOP_C op.
5627 break;
5628 }
5629
5630 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5631 // must follow. It's operand is the stack location
5632 // that holds the starting input index for the match of this .*
5633 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5634 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C)(void)0;
5635 int32_t stackLoc = URX_VAL(loopcOp)((loopcOp) & 0xffffff);
5636 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize)(void)0;
5637 fp->fExtra[stackLoc] = fp->fInputIdx;
5638 fp->fInputIdx = ix;
5639
5640 // Save State to the URX_LOOP_C op that follows this one,
5641 // so that match failures in the following code will return to there.
5642 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5643 fp = StateSave(fp, fp->fPatIdx, status);
5644 fp->fPatIdx++;
5645 }
5646 break;
5647
5648
5649 case URX_LOOP_C:
5650 {
5651 U_ASSERT(opValue>=0 && opValue<fFrameSize)(void)0;
5652 backSearchIndex = (int32_t)fp->fExtra[opValue];
5653 U_ASSERT(backSearchIndex <= fp->fInputIdx)(void)0;
5654 if (backSearchIndex == fp->fInputIdx) {
5655 // We've backed up the input idx to the point that the loop started.
5656 // The loop is done. Leave here without saving state.
5657 // Subsequent failures won't come back here.
5658 break;
5659 }
5660 // Set up for the next iteration of the loop, with input index
5661 // backed up by one from the last time through,
5662 // and a state save to this instruction in case the following code fails again.
5663 // (We're going backwards because this loop emulates stack unwinding, not
5664 // the initial scan forward.)
5665 U_ASSERT(fp->fInputIdx > 0)(void)0;
5666 UChar32 prevC;
5667 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC)do { (prevC)=(inputBuf)[--(fp->fInputIdx)]; if((((prevC)&
0xfffffc00)==0xdc00)) { uint16_t __c2; if((fp->fInputIdx)>
(0) && (((__c2=(inputBuf)[(fp->fInputIdx)-1])&
0xfffffc00)==0xd800)) { --(fp->fInputIdx); (prevC)=(((UChar32
)(__c2)<<10UL)+(UChar32)((prevC))-((0xd800<<10UL)
+0xdc00-0x10000)); } } } while (false)
; // !!!: should this 0 be one of f*Limit?
5668
5669 if (prevC == 0x0a &&
5670 fp->fInputIdx > backSearchIndex &&
5671 inputBuf[fp->fInputIdx-1] == 0x0d) {
5672 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5673 if (URX_TYPE(prevOp)((uint32_t)(prevOp) >> 24) == URX_LOOP_DOT_I) {
5674 // .*, stepping back over CRLF pair.
5675 U16_BACK_1(inputBuf, 0, fp->fInputIdx)do { if(((((inputBuf)[--(fp->fInputIdx)])&0xfffffc00)==
0xdc00) && (fp->fInputIdx)>(0) && ((((inputBuf
)[(fp->fInputIdx)-1])&0xfffffc00)==0xd800)) { --(fp->
fInputIdx); } } while (false)
;
5676 }
5677 }
5678
5679
5680 fp = StateSave(fp, fp->fPatIdx-1, status);
5681 }
5682 break;
5683
5684
5685
5686 default:
5687 // Trouble. The compiled pattern contains an entry with an
5688 // unrecognized type tag.
5689 UPRV_UNREACHABLE_ASSERT(void)0;
5690 // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
5691 // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
5692 // See ICU-21669.
5693 status = U_INTERNAL_PROGRAM_ERROR;
5694 }
5695
5696 if (U_FAILURE(status)) {
5697 isMatch = FALSE0;
5698 break;
5699 }
5700 }
5701
5702breakFromLoop:
5703 fMatch = isMatch;
5704 if (isMatch) {
5705 fLastMatchEnd = fMatchEnd;
5706 fMatchStart = startIdx;
5707 fMatchEnd = fp->fInputIdx;
5708 }
5709
5710#ifdef REGEX_RUN_DEBUG
5711 if (fTraceDebug) {
5712 if (isMatch) {
5713 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5714 } else {
5715 printf("No match\n\n");
5716 }
5717 }
5718#endif
5719
5720 fFrame = fp; // The active stack frame when the engine stopped.
5721 // Contains the capture group results that we need to
5722 // access later.
5723
5724 return;
5725}
5726
5727
5728UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)UClassID RegexMatcher::getStaticClassID() { static char classID
= 0; return (UClassID)&classID; } UClassID RegexMatcher::
getDynamicClassID() const { return RegexMatcher::getStaticClassID
(); }
5729
5730U_NAMESPACE_END}
5731
5732#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5733