Bug Summary

File: out/../deps/icu-small/source/common/uniset_props.cpp
Warning: line 385, column 17
Value stored to 'lastItem' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name uniset_props.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem /usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include 
-internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/common/uniset_props.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 1999-2014, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: uniset_props.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004aug25
16* created by: Markus W. Scherer
17*
18* Character property dependent functions moved here from uniset.cpp
19*/
20
21#include "unicode/utypes.h"
22#include "unicode/uniset.h"
23#include "unicode/parsepos.h"
24#include "unicode/uchar.h"
25#include "unicode/uscript.h"
26#include "unicode/symtable.h"
27#include "unicode/uset.h"
28#include "unicode/locid.h"
29#include "unicode/brkiter.h"
30#include "uset_imp.h"
31#include "ruleiter.h"
32#include "cmemory.h"
33#include "ucln_cmn.h"
34#include "util.h"
35#include "uvector.h"
36#include "uprops.h"
37#include "propname.h"
38#include "normalizer2impl.h"
39#include "uinvchar.h"
40#include "uprops.h"
41#include "charstr.h"
42#include "cstring.h"
43#include "mutex.h"
44#include "umutex.h"
45#include "uassert.h"
46#include "hash.h"
47
48U_NAMESPACE_USEusing namespace icu_71;
49
// Special property set IDs: names recognized by applyPropertyAlias() that are
// not real Unicode properties. The trailing comment gives the equivalent set.
static const char ANY[]      = "ANY";      // [\u0000-\U0010FFFF]
static const char ASCII[]    = "ASCII";    // [\u0000-\u007F]
static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Unicode name property alias, and its length in characters
// (excluding the terminating NUL).
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
58
59// Cached sets ------------------------------------------------------------- ***
60
61U_CDECL_BEGINextern "C" {
62static UBool U_CALLCONV uset_cleanup();
63
64static UnicodeSet *uni32Singleton;
65static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER{{ 0 }, U_ZERO_ERROR};
66
67/**
68 * Cleanup function for UnicodeSet
69 */
70static UBool U_CALLCONV uset_cleanup(void) {
71 delete uni32Singleton;
72 uni32Singleton = NULL__null;
73 uni32InitOnce.reset();
74 return TRUE1;
75}
76
77U_CDECL_END}
78
79U_NAMESPACE_BEGINnamespace icu_71 {
80
81namespace {
82
83// Cache some sets for other services -------------------------------------- ***
84void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
85 U_ASSERT(uni32Singleton == NULL)(void)0;
86 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]")icu::UnicodeString(true, u"[:age=3.2:]", -1), errorCode);
87 if(uni32Singleton==NULL__null) {
88 errorCode=U_MEMORY_ALLOCATION_ERROR;
89 } else {
90 uni32Singleton->freeze();
91 }
92 ucln_common_registerCleanupucln_common_registerCleanup_71(UCLN_COMMON_USET, uset_cleanup);
93}
94
95
96U_CFUNCextern "C" UnicodeSet *
97uniset_getUnicode32Instanceuniset_getUnicode32Instance_71(UErrorCode &errorCode) {
98 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
99 return uni32Singleton;
100}
101
102// helper functions for matching of pattern syntax pieces ------------------ ***
103// these functions are parallel to the PERL_OPEN etc. strings above
104
105// using these functions is not only faster than UnicodeString::compare() and
106// caseCompare(), but they also make UnicodeSet work for simple patterns when
107// no Unicode properties data is available - when caseCompare() fails
108
109static inline UBool
110isPerlOpen(const UnicodeString &pattern, int32_t pos) {
111 UChar c;
112 return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
113}
114
115/*static inline UBool
116isPerlClose(const UnicodeString &pattern, int32_t pos) {
117 return pattern.charAt(pos)==u'}';
118}*/
119
120static inline UBool
121isNameOpen(const UnicodeString &pattern, int32_t pos) {
122 return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
123}
124
125static inline UBool
126isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
127 return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
128}
129
130/*static inline UBool
131isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
132 return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
133}*/
134
135// TODO memory debugging provided inside uniset.cpp
136// could be made available here but probably obsolete with use of modern
137// memory leak checker tools
138#define _dbgct(me)
139
140} // namespace
141
142//----------------------------------------------------------------
143// Constructors &c
144//----------------------------------------------------------------
145
146/**
147 * Constructs a set from the given pattern, optionally ignoring
148 * white space. See the class description for the syntax of the
149 * pattern language.
150 * @param pattern a string specifying what characters are in the set
151 */
152UnicodeSet::UnicodeSet(const UnicodeString& pattern,
153 UErrorCode& status) {
154 applyPattern(pattern, status);
155 _dbgct(this);
156}
157
158//----------------------------------------------------------------
159// Public API
160//----------------------------------------------------------------
161
162UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
163 UErrorCode& status) {
164 // Equivalent to
165 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
166 // but without dependency on closeOver().
167 ParsePosition pos(0);
168 applyPatternIgnoreSpace(pattern, pos, NULL__null, status);
169 if (U_FAILURE(status)) return *this;
170
171 int32_t i = pos.getIndex();
172 // Skip over trailing whitespace
173 ICU_Utility::skipWhitespace(pattern, i, TRUE1);
174 if (i != pattern.length()) {
175 status = U_ILLEGAL_ARGUMENT_ERROR;
176 }
177 return *this;
178}
179
180void
181UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
182 ParsePosition& pos,
183 const SymbolTable* symbols,
184 UErrorCode& status) {
185 if (U_FAILURE(status)) {
186 return;
187 }
188 if (isFrozen()) {
189 status = U_NO_WRITE_PERMISSION;
190 return;
191 }
192 // Need to build the pattern in a temporary string because
193 // _applyPattern calls add() etc., which set pat to empty.
194 UnicodeString rebuiltPat;
195 RuleCharacterIterator chars(pattern, symbols, pos);
196 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL__null, 0, status);
197 if (U_FAILURE(status)) return;
198 if (chars.inVariable()) {
199 // syntaxError(chars, "Extra chars in variable value");
200 status = U_MALFORMED_SET;
201 return;
202 }
203 setPattern(rebuiltPat);
204}
205
206/**
207 * Return true if the given position, in the given pattern, appears
208 * to be the start of a UnicodeSet pattern.
209 */
210UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
211 return ((pos+1) < pattern.length() &&
212 pattern.charAt(pos) == (UChar)91/*[*/) ||
213 resemblesPropertyPattern(pattern, pos);
214}
215
216//----------------------------------------------------------------
217// Implementation: Pattern parsing
218//----------------------------------------------------------------
219
220namespace {
221
222/**
223 * A small all-inline class to manage a UnicodeSet pointer. Add
224 * operator->() etc. as needed.
225 */
226class UnicodeSetPointer {
227 UnicodeSet* p;
228public:
229 inline UnicodeSetPointer() : p(0) {}
230 inline ~UnicodeSetPointer() { delete p; }
231 inline UnicodeSet* pointer() { return p; }
232 inline UBool allocate() {
233 if (p == 0) {
234 p = new UnicodeSet();
235 }
236 return p != 0;
237 }
238};
239
240constexpr int32_t MAX_DEPTH = 100;
241
242} // namespace
243
244/**
245 * Parse the pattern from the given RuleCharacterIterator. The
246 * iterator is advanced over the parsed pattern.
247 * @param chars iterator over the pattern characters. Upon return
248 * it will be advanced to the first character after the parsed
249 * pattern, or the end of the iteration if all characters are
250 * parsed.
251 * @param symbols symbol table to use to parse and dereference
252 * variables, or null if none.
253 * @param rebuiltPat the pattern that was parsed, rebuilt or
254 * copied from the input pattern, as appropriate.
255 * @param options a bit mask of zero or more of the following:
256 * IGNORE_SPACE, CASE.
257 */
258void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
259 const SymbolTable* symbols,
260 UnicodeString& rebuiltPat,
261 uint32_t options,
262 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
263 int32_t depth,
264 UErrorCode& ec) {
265 if (U_FAILURE(ec)) return;
266 if (depth > MAX_DEPTH) {
267 ec = U_ILLEGAL_ARGUMENT_ERROR;
268 return;
269 }
270
271 // Syntax characters: [ ] ^ - & { }
272
273 // Recognized special forms for chars, sets: c-c s-s s&s
274
275 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
276 RuleCharacterIterator::PARSE_ESCAPES;
277 if ((options & USET_IGNORE_SPACE) != 0) {
278 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
279 }
280
281 UnicodeString patLocal, buf;
282 UBool usePat = FALSE0;
283 UnicodeSetPointer scratch;
284 RuleCharacterIterator::Pos backup;
285
286 // mode: 0=before [, 1=between [...], 2=after ]
287 // lastItem: 0=none, 1=char, 2=set
288 int8_t lastItem = 0, mode = 0;
289 UChar32 lastChar = 0;
290 UChar op = 0;
291
292 UBool invert = FALSE0;
293
294 clear();
295
296 while (mode != 2 && !chars.atEnd()) {
297 U_ASSERT((lastItem == 0 && op == 0) ||(void)0
298 (lastItem == 1 && (op == 0 || op == u'-')) ||(void)0
299 (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')))(void)0;
300
301 UChar32 c = 0;
302 UBool literal = FALSE0;
303 UnicodeSet* nested = 0; // alias - do not delete
304
305 // -------- Check for property pattern
306
307 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
308 int8_t setMode = 0;
309 if (resemblesPropertyPattern(chars, opts)) {
310 setMode = 2;
311 }
312
313 // -------- Parse '[' of opening delimiter OR nested set.
314 // If there is a nested set, use `setMode' to define how
315 // the set should be parsed. If the '[' is part of the
316 // opening delimiter for this pattern, parse special
317 // strings "[", "[^", "[-", and "[^-". Check for stand-in
318 // characters representing a nested set in the symbol
319 // table.
320
321 else {
322 // Prepare to backup if necessary
323 chars.getPos(backup);
324 c = chars.next(opts, literal, ec);
325 if (U_FAILURE(ec)) return;
326
327 if (c == u'[' && !literal) {
328 if (mode == 1) {
329 chars.setPos(backup); // backup
330 setMode = 1;
331 } else {
332 // Handle opening '[' delimiter
333 mode = 1;
334 patLocal.append(u'[');
335 chars.getPos(backup); // prepare to backup
336 c = chars.next(opts, literal, ec);
337 if (U_FAILURE(ec)) return;
338 if (c == u'^' && !literal) {
339 invert = TRUE1;
340 patLocal.append(u'^');
341 chars.getPos(backup); // prepare to backup
342 c = chars.next(opts, literal, ec);
343 if (U_FAILURE(ec)) return;
344 }
345 // Fall through to handle special leading '-';
346 // otherwise restart loop for nested [], \p{}, etc.
347 if (c == u'-') {
348 literal = TRUE1;
349 // Fall through to handle literal '-' below
350 } else {
351 chars.setPos(backup); // backup
352 continue;
353 }
354 }
355 } else if (symbols != 0) {
356 const UnicodeFunctor *m = symbols->lookupMatcher(c);
357 if (m != 0) {
358 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
359 if (ms == NULL__null) {
360 ec = U_MALFORMED_SET;
361 return;
362 }
363 // casting away const, but `nested' won't be modified
364 // (important not to modify stored set)
365 nested = const_cast<UnicodeSet*>(ms);
366 setMode = 3;
367 }
368 }
369 }
370
371 // -------- Handle a nested set. This either is inline in
372 // the pattern or represented by a stand-in that has
373 // previously been parsed and was looked up in the symbol
374 // table.
375
376 if (setMode != 0) {
377 if (lastItem == 1) {
378 if (op != 0) {
379 // syntaxError(chars, "Char expected after operator");
380 ec = U_MALFORMED_SET;
381 return;
382 }
383 add(lastChar, lastChar);
384 _appendToPat(patLocal, lastChar, FALSE0);
385 lastItem = 0;
Value stored to 'lastItem' is never read
386 op = 0;
387 }
388
389 if (op == u'-' || op == u'&') {
390 patLocal.append(op);
391 }
392
393 if (nested == 0) {
394 // lazy allocation
395 if (!scratch.allocate()) {
396 ec = U_MEMORY_ALLOCATION_ERROR;
397 return;
398 }
399 nested = scratch.pointer();
400 }
401 switch (setMode) {
402 case 1:
403 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
404 break;
405 case 2:
406 chars.skipIgnored(opts);
407 nested->applyPropertyPattern(chars, patLocal, ec);
408 if (U_FAILURE(ec)) return;
409 break;
410 case 3: // `nested' already parsed
411 nested->_toPattern(patLocal, FALSE0);
412 break;
413 }
414
415 usePat = TRUE1;
416
417 if (mode == 0) {
418 // Entire pattern is a category; leave parse loop
419 *this = *nested;
420 mode = 2;
421 break;
422 }
423
424 switch (op) {
425 case u'-':
426 removeAll(*nested);
427 break;
428 case u'&':
429 retainAll(*nested);
430 break;
431 case 0:
432 addAll(*nested);
433 break;
434 }
435
436 op = 0;
437 lastItem = 2;
438
439 continue;
440 }
441
442 if (mode == 0) {
443 // syntaxError(chars, "Missing '['");
444 ec = U_MALFORMED_SET;
445 return;
446 }
447
448 // -------- Parse special (syntax) characters. If the
449 // current character is not special, or if it is escaped,
450 // then fall through and handle it below.
451
452 if (!literal) {
453 switch (c) {
454 case u']':
455 if (lastItem == 1) {
456 add(lastChar, lastChar);
457 _appendToPat(patLocal, lastChar, FALSE0);
458 }
459 // Treat final trailing '-' as a literal
460 if (op == u'-') {
461 add(op, op);
462 patLocal.append(op);
463 } else if (op == u'&') {
464 // syntaxError(chars, "Trailing '&'");
465 ec = U_MALFORMED_SET;
466 return;
467 }
468 patLocal.append(u']');
469 mode = 2;
470 continue;
471 case u'-':
472 if (op == 0) {
473 if (lastItem != 0) {
474 op = (UChar) c;
475 continue;
476 } else {
477 // Treat final trailing '-' as a literal
478 add(c, c);
479 c = chars.next(opts, literal, ec);
480 if (U_FAILURE(ec)) return;
481 if (c == u']' && !literal) {
482 patLocal.append(u"-]", 2);
483 mode = 2;
484 continue;
485 }
486 }
487 }
488 // syntaxError(chars, "'-' not after char or set");
489 ec = U_MALFORMED_SET;
490 return;
491 case u'&':
492 if (lastItem == 2 && op == 0) {
493 op = (UChar) c;
494 continue;
495 }
496 // syntaxError(chars, "'&' not after set");
497 ec = U_MALFORMED_SET;
498 return;
499 case u'^':
500 // syntaxError(chars, "'^' not after '['");
501 ec = U_MALFORMED_SET;
502 return;
503 case u'{':
504 if (op != 0) {
505 // syntaxError(chars, "Missing operand after operator");
506 ec = U_MALFORMED_SET;
507 return;
508 }
509 if (lastItem == 1) {
510 add(lastChar, lastChar);
511 _appendToPat(patLocal, lastChar, FALSE0);
512 }
513 lastItem = 0;
514 buf.truncate(0);
515 {
516 UBool ok = FALSE0;
517 while (!chars.atEnd()) {
518 c = chars.next(opts, literal, ec);
519 if (U_FAILURE(ec)) return;
520 if (c == u'}' && !literal) {
521 ok = TRUE1;
522 break;
523 }
524 buf.append(c);
525 }
526 if (!ok) {
527 // syntaxError(chars, "Invalid multicharacter string");
528 ec = U_MALFORMED_SET;
529 return;
530 }
531 }
532 // We have new string. Add it to set and continue;
533 // we don't need to drop through to the further
534 // processing
535 add(buf);
536 patLocal.append(u'{');
537 _appendToPat(patLocal, buf, FALSE0);
538 patLocal.append(u'}');
539 continue;
540 case SymbolTable::SYMBOL_REF:
541 // symbols nosymbols
542 // [a-$] error error (ambiguous)
543 // [a$] anchor anchor
544 // [a-$x] var "x"* literal '$'
545 // [a-$.] error literal '$'
546 // *We won't get here in the case of var "x"
547 {
548 chars.getPos(backup);
549 c = chars.next(opts, literal, ec);
550 if (U_FAILURE(ec)) return;
551 UBool anchor = (c == u']' && !literal);
552 if (symbols == 0 && !anchor) {
553 c = SymbolTable::SYMBOL_REF;
554 chars.setPos(backup);
555 break; // literal '$'
556 }
557 if (anchor && op == 0) {
558 if (lastItem == 1) {
559 add(lastChar, lastChar);
560 _appendToPat(patLocal, lastChar, FALSE0);
561 }
562 add(U_ETHER((char16_t)0xFFFF));
563 usePat = TRUE1;
564 patLocal.append((UChar) SymbolTable::SYMBOL_REF);
565 patLocal.append(u']');
566 mode = 2;
567 continue;
568 }
569 // syntaxError(chars, "Unquoted '$'");
570 ec = U_MALFORMED_SET;
571 return;
572 }
573 default:
574 break;
575 }
576 }
577
578 // -------- Parse literal characters. This includes both
579 // escaped chars ("\u4E01") and non-syntax characters
580 // ("a").
581
582 switch (lastItem) {
583 case 0:
584 lastItem = 1;
585 lastChar = c;
586 break;
587 case 1:
588 if (op == u'-') {
589 if (lastChar >= c) {
590 // Don't allow redundant (a-a) or empty (b-a) ranges;
591 // these are most likely typos.
592 // syntaxError(chars, "Invalid range");
593 ec = U_MALFORMED_SET;
594 return;
595 }
596 add(lastChar, c);
597 _appendToPat(patLocal, lastChar, FALSE0);
598 patLocal.append(op);
599 _appendToPat(patLocal, c, FALSE0);
600 lastItem = 0;
601 op = 0;
602 } else {
603 add(lastChar, lastChar);
604 _appendToPat(patLocal, lastChar, FALSE0);
605 lastChar = c;
606 }
607 break;
608 case 2:
609 if (op != 0) {
610 // syntaxError(chars, "Set expected after operator");
611 ec = U_MALFORMED_SET;
612 return;
613 }
614 lastChar = c;
615 lastItem = 1;
616 break;
617 }
618 }
619
620 if (mode != 2) {
621 // syntaxError(chars, "Missing ']'");
622 ec = U_MALFORMED_SET;
623 return;
624 }
625
626 chars.skipIgnored(opts);
627
628 /**
629 * Handle global flags (invert, case insensitivity). If this
630 * pattern should be compiled case-insensitive, then we need
631 * to close over case BEFORE COMPLEMENTING. This makes
632 * patterns like /[^abc]/i work.
633 */
634 if ((options & USET_CASE_INSENSITIVE) != 0) {
635 (this->*caseClosure)(USET_CASE_INSENSITIVE);
636 }
637 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
638 (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
639 }
640 if (invert) {
641 complement().removeAllStrings(); // code point complement
642 }
643
644 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
645 // generated pattern.
646 if (usePat) {
647 rebuiltPat.append(patLocal);
648 } else {
649 _generatePattern(rebuiltPat, FALSE0);
650 }
651 if (isBogus() && U_SUCCESS(ec)) {
652 // We likely ran out of memory. AHHH!
653 ec = U_MEMORY_ALLOCATION_ERROR;
654 }
655}
656
657//----------------------------------------------------------------
658// Property set implementation
659//----------------------------------------------------------------
660
661namespace {
662
663static UBool numericValueFilter(UChar32 ch, void* context) {
664 return u_getNumericValueu_getNumericValue_71(ch) == *(double*)context;
665}
666
667static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
668 int32_t value = *(int32_t*)context;
669 return (U_GET_GC_MASK((UChar32) ch)((uint32_t)1<<(u_charType_71((UChar32) ch))) & value) != 0;
670}
671
672static UBool versionFilter(UChar32 ch, void* context) {
673 static const UVersionInfo none = { 0, 0, 0, 0 };
674 UVersionInfo v;
675 u_charAgeu_charAge_71(ch, v);
676 UVersionInfo* version = (UVersionInfo*)context;
677 return uprv_memcmp(&v, &none, sizeof(v)):: memcmp(&v, &none,sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)):: memcmp(&v, version,sizeof(v)) <= 0;
678}
679
680typedef struct {
681 UProperty prop;
682 int32_t value;
683} IntPropertyContext;
684
685static UBool intPropertyFilter(UChar32 ch, void* context) {
686 IntPropertyContext* c = (IntPropertyContext*)context;
687 return u_getIntPropertyValueu_getIntPropertyValue_71((UChar32) ch, c->prop) == c->value;
688}
689
690static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
691 return uscript_hasScriptuscript_hasScript_71(ch, *(UScriptCode*)context);
692}
693
694} // namespace
695
696/**
697 * Generic filter-based scanning code for UCD property UnicodeSets.
698 */
699void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
700 void* context,
701 const UnicodeSet* inclusions,
702 UErrorCode &status) {
703 if (U_FAILURE(status)) return;
704
705 // Logically, walk through all Unicode characters, noting the start
706 // and end of each range for which filter.contain(c) is
707 // true. Add each range to a set.
708 //
709 // To improve performance, use an inclusions set which
710 // encodes information about character ranges that are known
711 // to have identical properties.
712 // inclusions contains the first characters of
713 // same-value ranges for the given property.
714
715 clear();
716
717 UChar32 startHasProperty = -1;
718 int32_t limitRange = inclusions->getRangeCount();
719
720 for (int j=0; j<limitRange; ++j) {
721 // get current range
722 UChar32 start = inclusions->getRangeStart(j);
723 UChar32 end = inclusions->getRangeEnd(j);
724
725 // for all the code points in the range, process
726 for (UChar32 ch = start; ch <= end; ++ch) {
727 // only add to this UnicodeSet on inflection points --
728 // where the hasProperty value changes to false
729 if ((*filter)(ch, context)) {
730 if (startHasProperty < 0) {
731 startHasProperty = ch;
732 }
733 } else if (startHasProperty >= 0) {
734 add(startHasProperty, ch-1);
735 startHasProperty = -1;
736 }
737 }
738 }
739 if (startHasProperty >= 0) {
740 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
741 }
742 if (isBogus() && U_SUCCESS(status)) {
743 // We likely ran out of memory. AHHH!
744 status = U_MEMORY_ALLOCATION_ERROR;
745 }
746}
747
748namespace {
749
750static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
751 /* Note: we use ' ' in compiler code page */
752 int32_t j = 0;
753 char ch;
754 --dstCapacity; /* make room for term. zero */
755 while ((ch = *src++) != 0) {
756 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
757 continue;
758 }
759 if (j >= dstCapacity) return FALSE0;
760 dst[j++] = ch;
761 }
762 if (j > 0 && dst[j-1] == ' ') --j;
763 dst[j] = 0;
764 return TRUE1;
765}
766
767} // namespace
768
769//----------------------------------------------------------------
770// Property set API
771//----------------------------------------------------------------
772
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
// member function. Written as a block macro so that it can be used like a
// single statement followed by a semicolon.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    ec=U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
777
778UnicodeSet&
779UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
780 if (U_FAILURE(ec) || isFrozen()) { return *this; }
781 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
782 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
783 applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
784 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
785 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
786 UScriptCode script = (UScriptCode)value;
787 applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
788 } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
789 if (value == 0 || value == 1) {
790 const USet *set = u_getBinaryPropertySetu_getBinaryPropertySet_71(prop, &ec);
791 if (U_FAILURE(ec)) { return *this; }
792 copyFrom(*UnicodeSet::fromUSet(set), TRUE1);
793 if (value == 0) {
794 complement().removeAllStrings(); // code point complement
795 }
796 } else {
797 clear();
798 }
799 } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
800 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
801 IntPropertyContext c = {prop, value};
802 applyFilter(intPropertyFilter, &c, inclusions, ec);
803 } else {
804 ec = U_ILLEGAL_ARGUMENT_ERROR;
805 }
806 return *this;
807}
808
809UnicodeSet&
810UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
811 const UnicodeString& value,
812 UErrorCode& ec) {
813 if (U_FAILURE(ec) || isFrozen()) return *this;
814
815 // prop and value used to be converted to char * using the default
816 // converter instead of the invariant conversion.
817 // This should not be necessary because all Unicode property and value
818 // names use only invariant characters.
819 // If there are any variant characters, then we won't find them anyway.
820 // Checking first avoids assertion failures in the conversion.
821 if( !uprv_isInvariantUStringuprv_isInvariantUString_71(prop.getBuffer(), prop.length()) ||
822 !uprv_isInvariantUStringuprv_isInvariantUString_71(value.getBuffer(), value.length())
823 ) {
824 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
825 }
826 CharString pname, vname;
827 pname.appendInvariantChars(prop, ec);
828 vname.appendInvariantChars(value, ec);
829 if (U_FAILURE(ec)) return *this;
830
831 UProperty p;
832 int32_t v;
833 UBool invert = FALSE0;
834
835 if (value.length() > 0) {
836 p = u_getPropertyEnumu_getPropertyEnum_71(pname.data());
837 if (p == UCHAR_INVALID_CODE) FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
838
839 // Treat gc as gcm
840 if (p == UCHAR_GENERAL_CATEGORY) {
841 p = UCHAR_GENERAL_CATEGORY_MASK;
842 }
843
844 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
845 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
846 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
847 v = u_getPropertyValueEnumu_getPropertyValueEnum_71(p, vname.data());
848 if (v == UCHAR_INVALID_CODE) {
849 // Handle numeric CCC
850 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
851 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
852 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
853 char* end;
854 double val = uprv_strtod(vname.data(), &end):: strtod(vname.data(), &end);
855 // Anything between 0 and 255 is valid even if unused.
856 // Cast double->int only after range check.
857 // We catch NaN here because comparing it with both 0 and 255 will be false
858 // (as are all comparisons with NaN).
859 if (*end != 0 || !(0 <= val && val <= 255) ||
860 (v = (int32_t)val) != val) {
861 // non-integral value or outside 0..255, or trailing junk
862 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
863 }
864 } else {
865 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
866 }
867 }
868 }
869
870 else {
871
872 switch (p) {
873 case UCHAR_NUMERIC_VALUE:
874 {
875 char* end;
876 double val = uprv_strtod(vname.data(), &end):: strtod(vname.data(), &end);
877 if (*end != 0) {
878 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
879 }
880 applyFilter(numericValueFilter, &val,
881 CharacterProperties::getInclusionsForProperty(p, ec), ec);
882 return *this;
883 }
884 case UCHAR_NAME:
885 {
886 // Must munge name, since u_charFromName() does not do
887 // 'loose' matching.
888 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
889 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
890 UChar32 ch = u_charFromNameu_charFromName_71(U_EXTENDED_CHAR_NAME, buf, &ec);
891 if (U_SUCCESS(ec)) {
892 clear();
893 add(ch);
894 return *this;
895 } else {
896 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
897 }
898 }
899 case UCHAR_UNICODE_1_NAME:
900 // ICU 49 deprecates the Unicode_1_Name property APIs.
901 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
902 case UCHAR_AGE:
903 {
904 // Must munge name, since u_versionFromString() does not do
905 // 'loose' matching.
906 char buf[128];
907 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
908 UVersionInfo version;
909 u_versionFromStringu_versionFromString_71(version, buf);
910 applyFilter(versionFilter, &version,
911 CharacterProperties::getInclusionsForProperty(p, ec), ec);
912 return *this;
913 }
914 case UCHAR_SCRIPT_EXTENSIONS:
915 v = u_getPropertyValueEnumu_getPropertyValueEnum_71(UCHAR_SCRIPT, vname.data());
916 if (v == UCHAR_INVALID_CODE) {
917 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
918 }
919 // fall through to calling applyIntPropertyValue()
920 break;
921 default:
922 // p is a non-binary, non-enumerated property that we
923 // don't support (yet).
924 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
925 }
926 }
927 }
928
929 else {
930 // value is empty. Interpret as General Category, Script, or
931 // Binary property.
932 p = UCHAR_GENERAL_CATEGORY_MASK;
933 v = u_getPropertyValueEnumu_getPropertyValueEnum_71(p, pname.data());
934 if (v == UCHAR_INVALID_CODE) {
935 p = UCHAR_SCRIPT;
936 v = u_getPropertyValueEnumu_getPropertyValueEnum_71(p, pname.data());
937 if (v == UCHAR_INVALID_CODE) {
938 p = u_getPropertyEnumu_getPropertyEnum_71(pname.data());
939 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
940 v = 1;
941 } else if (0 == uprv_comparePropertyNamesuprv_compareASCIIPropertyNames_71(ANY, pname.data())) {
942 set(MIN_VALUE, MAX_VALUE);
943 return *this;
944 } else if (0 == uprv_comparePropertyNamesuprv_compareASCIIPropertyNames_71(ASCII, pname.data())) {
945 set(0, 0x7F);
946 return *this;
947 } else if (0 == uprv_comparePropertyNamesuprv_compareASCIIPropertyNames_71(ASSIGNED, pname.data())) {
948 // [:Assigned:]=[:^Cn:]
949 p = UCHAR_GENERAL_CATEGORY_MASK;
950 v = U_GC_CN_MASK((uint32_t)1<<(U_GENERAL_OTHER_TYPES));
951 invert = TRUE1;
952 } else {
953 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
954 }
955 }
956 }
957 }
958
959 applyIntPropertyValue(p, v, ec);
960 if(invert) {
961 complement().removeAllStrings(); // code point complement
962 }
963
964 if (isBogus() && U_SUCCESS(ec)) {
965 // We likely ran out of memory. AHHH!
966 ec = U_MEMORY_ALLOCATION_ERROR;
967 }
968 return *this;
969}
970
971//----------------------------------------------------------------
972// Property set patterns
973//----------------------------------------------------------------
974
975/**
976 * Return true if the given position, in the given pattern, appears
977 * to be the start of a property set pattern.
978 */
979UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
980 int32_t pos) {
981 // Patterns are at least 5 characters long
982 if ((pos+5) > pattern.length()) {
983 return FALSE0;
984 }
985
986 // Look for an opening [:, [:^, \p, or \P
987 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
988}
989
990/**
991 * Return true if the given iterator appears to point at a
992 * property pattern. Regardless of the result, return with the
993 * iterator unchanged.
994 * @param chars iterator over the pattern characters. Upon return
995 * it will be unchanged.
996 * @param iterOpts RuleCharacterIterator options
997 */
998UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
999 int32_t iterOpts) {
1000 // NOTE: literal will always be FALSE, because we don't parse escapes.
1001 UBool result = FALSE0, literal;
1002 UErrorCode ec = U_ZERO_ERROR;
1003 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1004 RuleCharacterIterator::Pos pos;
1005 chars.getPos(pos);
1006 UChar32 c = chars.next(iterOpts, literal, ec);
1007 if (c == u'[' || c == u'\\') {
1008 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1009 literal, ec);
1010 result = (c == u'[') ? (d == u':') :
1011 (d == u'N' || d == u'p' || d == u'P');
1012 }
1013 chars.setPos(pos);
1014 return result && U_SUCCESS(ec);
1015}
1016
1017/**
1018 * Parse the given property pattern at the given parse position.
1019 */
1020UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1021 ParsePosition& ppos,
1022 UErrorCode &ec) {
1023 int32_t pos = ppos.getIndex();
1024
1025 UBool posix = FALSE0; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1026 UBool isName = FALSE0; // true for \N{pat}, o/w false
1027 UBool invert = FALSE0;
1028
1029 if (U_FAILURE(ec)) return *this;
1030
1031 // Minimum length is 5 characters, e.g. \p{L}
1032 if ((pos+5) > pattern.length()) {
1033 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
1034 }
1035
1036 // On entry, ppos should point to one of the following locations:
1037 // Look for an opening [:, [:^, \p, or \P
1038 if (isPOSIXOpen(pattern, pos)) {
1039 posix = TRUE1;
1040 pos += 2;
1041 pos = ICU_Utility::skipWhitespace(pattern, pos);
1042 if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
1043 ++pos;
1044 invert = TRUE1;
1045 }
1046 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1047 UChar c = pattern.charAt(pos+1);
1048 invert = (c == u'P');
1049 isName = (c == u'N');
1050 pos += 2;
1051 pos = ICU_Utility::skipWhitespace(pattern, pos);
1052 if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
1053 // Syntax error; "\p" or "\P" not followed by "{"
1054 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
1055 }
1056 } else {
1057 // Open delimiter not seen
1058 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
1059 }
1060
1061 // Look for the matching close delimiter, either :] or }
1062 int32_t close;
1063 if (posix) {
1064 close = pattern.indexOf(u":]", 2, pos);
1065 } else {
1066 close = pattern.indexOf(u'}', pos);
1067 }
1068 if (close < 0) {
1069 // Syntax error; close delimiter missing
1070 FAIL(ec)do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (false
)
;
1071 }
1072
1073 // Look for an '=' sign. If this is present, we will parse a
1074 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1075 // pattern.
1076 int32_t equals = pattern.indexOf(u'=', pos);
1077 UnicodeString propName, valueName;
1078 if (equals >= 0 && equals < close && !isName) {
1079 // Equals seen; parse medium/long pattern
1080 pattern.extractBetween(pos, equals, propName);
1081 pattern.extractBetween(equals+1, close, valueName);
1082 }
1083
1084 else {
1085 // Handle case where no '=' is seen, and \N{}
1086 pattern.extractBetween(pos, close, propName);
1087
1088 // Handle \N{name}
1089 if (isName) {
1090 // This is a little inefficient since it means we have to
1091 // parse NAME_PROP back to UCHAR_NAME even though we already
1092 // know it's UCHAR_NAME. If we refactor the API to
1093 // support args of (UProperty, char*) then we can remove
1094 // NAME_PROP and make this a little more efficient.
1095 valueName = propName;
1096 propName = UnicodeString(NAME_PROP"na", NAME_PROP_LENGTH2, US_INVicu::UnicodeString::kInvariant);
1097 }
1098 }
1099
1100 applyPropertyAlias(propName, valueName, ec);
1101
1102 if (U_SUCCESS(ec)) {
1103 if (invert) {
1104 complement().removeAllStrings(); // code point complement
1105 }
1106
1107 // Move to the limit position after the close delimiter if the
1108 // parse succeeded.
1109 ppos.setIndex(close + (posix ? 2 : 1));
1110 }
1111
1112 return *this;
1113}
1114
1115/**
1116 * Parse a property pattern.
1117 * @param chars iterator over the pattern characters. Upon return
1118 * it will be advanced to the first character after the parsed
1119 * pattern, or the end of the iteration if all characters are
1120 * parsed.
1121 * @param rebuiltPat the pattern that was parsed, rebuilt or
1122 * copied from the input pattern, as appropriate.
1123 */
1124void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1125 UnicodeString& rebuiltPat,
1126 UErrorCode& ec) {
1127 if (U_FAILURE(ec)) return;
1128 UnicodeString pattern;
1129 chars.lookahead(pattern);
1130 ParsePosition pos(0);
1131 applyPropertyPattern(pattern, pos, ec);
1132 if (U_FAILURE(ec)) return;
1133 if (pos.getIndex() == 0) {
1134 // syntaxError(chars, "Invalid property pattern");
1135 ec = U_MALFORMED_SET;
1136 return;
1137 }
1138 chars.jumpahead(pos.getIndex());
1139 rebuiltPat.append(pattern, 0, pos.getIndex());
1140}
1141
1142U_NAMESPACE_END}