ICU 4.6
4.6
|
00001 /* 00002 *************************************************************************** 00003 * Copyright (C) 1999-2010, International Business Machines Corporation 00004 * and others. All Rights Reserved. 00005 *************************************************************************** 00006 * Date Name Description 00007 * 10/20/99 alan Creation. 00008 *************************************************************************** 00009 */ 00010 00011 #ifndef UNICODESET_H 00012 #define UNICODESET_H 00013 00014 #include "unicode/unifilt.h" 00015 #include "unicode/unistr.h" 00016 #include "unicode/uset.h" 00017 00023 U_NAMESPACE_BEGIN 00024 00025 class BMPSet; 00026 class ParsePosition; 00027 class SymbolTable; 00028 class UnicodeSetStringSpan; 00029 class UVector; 00030 class RuleCharacterIterator; 00031 00272 class U_COMMON_API UnicodeSet : public UnicodeFilter { 00273 00274 int32_t len; // length of list used; 0 <= len <= capacity 00275 int32_t capacity; // capacity of list 00276 UChar32* list; // MUST be terminated with HIGH 00277 BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL. 00278 UChar32* buffer; // internal buffer, may be NULL 00279 int32_t bufferCapacity; // capacity of buffer 00280 int32_t patLen; 00281 00291 UChar *pat; 00292 UVector* strings; // maintained in sorted order 00293 UnicodeSetStringSpan *stringSpan; 00294 00295 private: 00296 enum { // constants 00297 kIsBogus = 1 // This set is bogus (i.e. not valid) 00298 }; 00299 uint8_t fFlags; // Bit flag (see constants above) 00300 public: 00310 inline UBool isBogus(void) const; 00311 00328 void setToBogus(); 00329 00330 public: 00331 00332 enum { 00337 MIN_VALUE = 0, 00338 00343 MAX_VALUE = 0x10ffff 00344 }; 00345 00346 //---------------------------------------------------------------- 00347 // Constructors &c 00348 //---------------------------------------------------------------- 00349 00350 public: 00351 00356 UnicodeSet(); 00357 00366 UnicodeSet(UChar32 start, UChar32 end); 00367 00376 UnicodeSet(const UnicodeString& pattern, 00377 UErrorCode& status); 00378 00391 UnicodeSet(const UnicodeString& pattern, 00392 uint32_t options, 00393 const SymbolTable* symbols, 00394 UErrorCode& status); 00395 00409 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 00410 uint32_t options, 00411 const SymbolTable* symbols, 00412 UErrorCode& status); 00413 00418 UnicodeSet(const UnicodeSet& o); 00419 00424 virtual ~UnicodeSet(); 00425 00431 UnicodeSet& operator=(const UnicodeSet& o); 00432 00444 virtual UBool operator==(const UnicodeSet& o) const; 00445 00451 UBool operator!=(const UnicodeSet& o) const; 00452 00462 virtual UnicodeFunctor* clone() const; 00463 00471 virtual int32_t hashCode(void) const; 00472 00481 inline static UnicodeSet *fromUSet(USet *uset); 00482 00491 inline static const UnicodeSet *fromUSet(const USet *uset); 00492 00500 inline USet *toUSet(); 00501 00502 00510 inline const USet * toUSet() const; 00511 00512 00513 //---------------------------------------------------------------- 00514 // Freezable API 00515 //---------------------------------------------------------------- 00516 00525 inline UBool isFrozen() const; 00526 00540 UnicodeFunctor *freeze(); 00541 00550 UnicodeFunctor *cloneAsThawed() const; 00551 00552 //---------------------------------------------------------------- 00553 // Public API 00554 //---------------------------------------------------------------- 00555 00566 UnicodeSet& set(UChar32 start, UChar32 end); 00567 00573 static UBool resemblesPattern(const UnicodeString& pattern, 00574 int32_t pos); 00575 00588 UnicodeSet& applyPattern(const UnicodeString& pattern, 00589 UErrorCode& status); 00590 00607 UnicodeSet& applyPattern(const UnicodeString& pattern, 00608 uint32_t options, 00609 const SymbolTable* symbols, 00610 UErrorCode& status); 00611 00643 UnicodeSet& applyPattern(const UnicodeString& pattern, 00644 ParsePosition& pos, 00645 uint32_t options, 00646 const SymbolTable* symbols, 00647 UErrorCode& status); 00648 00662 virtual UnicodeString& toPattern(UnicodeString& result, 00663 UBool escapeUnprintable = FALSE) const; 00664 00687 UnicodeSet& applyIntPropertyValue(UProperty prop, 00688 int32_t value, 00689 UErrorCode& ec); 00690 00720 UnicodeSet& applyPropertyAlias(const UnicodeString& prop, 00721 const UnicodeString& value, 00722 UErrorCode& ec); 00723 00732 virtual int32_t size(void) const; 00733 00740 virtual UBool isEmpty(void) const; 00741 00749 virtual UBool contains(UChar32 c) const; 00750 00759 virtual UBool contains(UChar32 start, UChar32 end) const; 00760 00768 UBool contains(const UnicodeString& s) const; 00769 00777 virtual UBool containsAll(const UnicodeSet& c) const; 00778 00786 UBool containsAll(const UnicodeString& s) const; 00787 00796 UBool containsNone(UChar32 start, UChar32 end) const; 00797 00805 UBool containsNone(const UnicodeSet& c) const; 00806 00814 UBool containsNone(const UnicodeString& s) const; 00815 00824 inline UBool containsSome(UChar32 start, UChar32 end) const; 00825 00833 inline UBool containsSome(const UnicodeSet& s) const; 00834 00842 inline UBool containsSome(const UnicodeString& s) const; 00843 00862 int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const; 00863 00876 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const; 00877 00895 int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const; 00896 00910 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const; 00911 00930 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 00931 00949 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 00950 00955 virtual UMatchDegree matches(const Replaceable& text, 00956 int32_t& offset, 00957 int32_t limit, 00958 UBool incremental); 00959 00960 private: 00982 static int32_t matchRest(const Replaceable& text, 00983 int32_t start, int32_t limit, 00984 const UnicodeString& s); 00985 00995 int32_t findCodePoint(UChar32 c) const; 00996 00997 public: 00998 01006 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; 01007 01016 int32_t indexOf(UChar32 c) const; 01017 01027 UChar32 charAt(int32_t index) const; 01028 01043 virtual UnicodeSet& add(UChar32 start, UChar32 end); 01044 01052 UnicodeSet& add(UChar32 c); 01053 01065 UnicodeSet& add(const UnicodeString& s); 01066 01067 private: 01073 static int32_t getSingleCP(const UnicodeString& s); 01074 01075 void _add(const UnicodeString& s); 01076 01077 public: 01086 UnicodeSet& addAll(const UnicodeString& s); 01087 01096 UnicodeSet& retainAll(const UnicodeString& s); 01097 01106 UnicodeSet& complementAll(const UnicodeString& s); 01107 01116 UnicodeSet& removeAll(const UnicodeString& s); 01117 01126 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s); 01127 01128 01136 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s); 01137 01151 virtual UnicodeSet& retain(UChar32 start, UChar32 end); 01152 01153 01159 UnicodeSet& retain(UChar32 c); 01160 01174 virtual UnicodeSet& remove(UChar32 start, UChar32 end); 01175 01183 UnicodeSet& remove(UChar32 c); 01184 01194 UnicodeSet& remove(const UnicodeString& s); 01195 01203 virtual UnicodeSet& complement(void); 01204 01219 virtual UnicodeSet& complement(UChar32 start, UChar32 end); 01220 01228 UnicodeSet& complement(UChar32 c); 01229 01240 UnicodeSet& complement(const UnicodeString& s); 01241 01254 virtual UnicodeSet& addAll(const UnicodeSet& c); 01255 01267 virtual UnicodeSet& retainAll(const UnicodeSet& c); 01268 01280 virtual UnicodeSet& removeAll(const UnicodeSet& c); 01281 01292 virtual UnicodeSet& complementAll(const UnicodeSet& c); 01293 01300 virtual UnicodeSet& clear(void); 01301 01327 UnicodeSet& closeOver(int32_t attribute); 01328 01335 virtual UnicodeSet &removeAllStrings(); 01336 01344 virtual int32_t getRangeCount(void) const; 01345 01353 virtual UChar32 getRangeStart(int32_t index) const; 01354 01362 virtual UChar32 getRangeEnd(int32_t index) const; 01363 01412 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const; 01413 01420 virtual UnicodeSet& compact(); 01421 01433 static UClassID U_EXPORT2 getStaticClassID(void); 01434 01443 virtual UClassID getDynamicClassID(void) const; 01444 01445 private: 01446 01447 // Private API for the USet API 01448 01449 friend class USetAccess; 01450 01451 int32_t getStringCount() const; 01452 01453 const UnicodeString* getString(int32_t index) const; 01454 01455 //---------------------------------------------------------------- 01456 // RuleBasedTransliterator support 01457 //---------------------------------------------------------------- 01458 01459 private: 01460 01466 virtual UBool matchesIndexValue(uint8_t v) const; 01467 01468 private: 01469 01470 //---------------------------------------------------------------- 01471 // Implementation: Clone as thawed (see ICU4J Freezable) 01472 //---------------------------------------------------------------- 01473 01474 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */); 01475 01476 //---------------------------------------------------------------- 01477 // Implementation: Pattern parsing 01478 //---------------------------------------------------------------- 01479 01480 void applyPattern(RuleCharacterIterator& chars, 01481 const SymbolTable* symbols, 01482 UnicodeString& rebuiltPat, 01483 uint32_t options, 01484 UErrorCode& ec); 01485 01486 //---------------------------------------------------------------- 01487 // Implementation: Utility methods 01488 //---------------------------------------------------------------- 01489 01490 void ensureCapacity(int32_t newLen, UErrorCode& ec); 01491 01492 void ensureBufferCapacity(int32_t newLen, UErrorCode& ec); 01493 01494 void swapBuffers(void); 01495 01496 UBool allocateStrings(UErrorCode &status); 01497 01498 UnicodeString& _toPattern(UnicodeString& result, 01499 UBool escapeUnprintable) const; 01500 01501 UnicodeString& _generatePattern(UnicodeString& result, 01502 UBool escapeUnprintable) const; 01503 01504 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable); 01505 01506 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable); 01507 01508 //---------------------------------------------------------------- 01509 // Implementation: Fundamental operators 01510 //---------------------------------------------------------------- 01511 01512 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity); 01513 01514 void add(const UChar32* other, int32_t otherLen, int8_t polarity); 01515 01516 void retain(const UChar32* other, int32_t otherLen, int8_t polarity); 01517 01523 static UBool resemblesPropertyPattern(const UnicodeString& pattern, 01524 int32_t pos); 01525 01526 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars, 01527 int32_t iterOpts); 01528 01567 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern, 01568 ParsePosition& ppos, 01569 UErrorCode &ec); 01570 01571 void applyPropertyPattern(RuleCharacterIterator& chars, 01572 UnicodeString& rebuiltPat, 01573 UErrorCode& ec); 01574 01575 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); 01576 01581 typedef UBool (*Filter)(UChar32 codePoint, void* context); 01582 01592 void applyFilter(Filter filter, 01593 void* context, 01594 int32_t src, 01595 UErrorCode &status); 01596 01600 void setPattern(const UnicodeString& newPat); 01604 void releasePattern(); 01605 01606 friend class UnicodeSetIterator; 01607 }; 01608 01609 01610 01611 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const { 01612 return !operator==(o); 01613 } 01614 01615 inline UBool UnicodeSet::isFrozen() const { 01616 return (UBool)(bmpSet!=NULL || stringSpan!=NULL); 01617 } 01618 01619 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const { 01620 return !containsNone(start, end); 01621 } 01622 01623 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const { 01624 return !containsNone(s); 01625 } 01626 01627 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const { 01628 return !containsNone(s); 01629 } 01630 01631 inline UBool UnicodeSet::isBogus() const { 01632 return (UBool)(fFlags & kIsBogus); 01633 } 01634 01635 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) { 01636 return reinterpret_cast<UnicodeSet *>(uset); 01637 } 01638 01639 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) { 01640 return reinterpret_cast<const UnicodeSet *>(uset); 01641 } 01642 01643 inline USet *UnicodeSet::toUSet() { 01644 return reinterpret_cast<USet *>(this); 01645 } 01646 01647 inline const USet *UnicodeSet::toUSet() const { 01648 return reinterpret_cast<const USet *>(this); 01649 } 01650 01651 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const { 01652 int32_t sLength=s.length(); 01653 if(start<0) { 01654 start=0; 01655 } else if(start>sLength) { 01656 start=sLength; 01657 } 01658 return start+span(s.getBuffer()+start, sLength-start, spanCondition); 01659 } 01660 01661 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const { 01662 int32_t sLength=s.length(); 01663 if(limit<0) { 01664 limit=0; 01665 } else if(limit>sLength) { 01666 limit=sLength; 01667 } 01668 return spanBack(s.getBuffer(), limit, spanCondition); 01669 } 01670 01671 U_NAMESPACE_END 01672 01673 #endif