1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unicode/caniter.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,208 @@ 1.4 +/* 1.5 + ******************************************************************************* 1.6 + * Copyright (C) 1996-2011, International Business Machines Corporation and 1.7 + * others. All Rights Reserved. 1.8 + ******************************************************************************* 1.9 + */ 1.10 + 1.11 +#ifndef CANITER_H 1.12 +#define CANITER_H 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_NORMALIZATION 1.17 + 1.18 +#include "unicode/uobject.h" 1.19 +#include "unicode/unistr.h" 1.20 + 1.21 +/** 1.22 + * \file 1.23 + * \brief C++ API: Canonical Iterator 1.24 + */ 1.25 + 1.26 +/** Should permutation skip characters with combining class zero 1.27 + * Should be either TRUE or FALSE. This is a compile time option 1.28 + * @stable ICU 2.4 1.29 + */ 1.30 +#ifndef CANITER_SKIP_ZEROES 1.31 +#define CANITER_SKIP_ZEROES TRUE 1.32 +#endif 1.33 + 1.34 +U_NAMESPACE_BEGIN 1.35 + 1.36 +class Hashtable; 1.37 +class Normalizer2; 1.38 +class Normalizer2Impl; 1.39 + 1.40 +/** 1.41 + * This class allows one to iterate through all the strings that are canonically equivalent to a given 1.42 + * string. For example, here are some sample results: 1.43 +Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 1.44 +1: \\u0041\\u030A\\u0064\\u0307\\u0327 1.45 + = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 1.46 +2: \\u0041\\u030A\\u0064\\u0327\\u0307 1.47 + = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 1.48 +3: \\u0041\\u030A\\u1E0B\\u0327 1.49 + = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 1.50 +4: \\u0041\\u030A\\u1E11\\u0307 1.51 + = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 1.52 +5: \\u00C5\\u0064\\u0307\\u0327 1.53 + = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 1.54 +6: \\u00C5\\u0064\\u0327\\u0307 1.55 + = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 1.56 +7: \\u00C5\\u1E0B\\u0327 1.57 + = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 1.58 +8: \\u00C5\\u1E11\\u0307 1.59 + = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 1.60 +9: \\u212B\\u0064\\u0307\\u0327 1.61 + = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 1.62 +10: \\u212B\\u0064\\u0327\\u0307 1.63 + = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 1.64 +11: \\u212B\\u1E0B\\u0327 1.65 + = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 1.66 +12: \\u212B\\u1E11\\u0307 1.67 + = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 1.68 + *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 1.69 + * since it has not been optimized for that situation. 1.70 + * Note, CanonicalIterator is not intended to be subclassed. 1.71 + * @author M. Davis 1.72 + * @author C++ port by V. Weinstein 1.73 + * @stable ICU 2.4 1.74 + */ 1.75 +class U_COMMON_API CanonicalIterator : public UObject { 1.76 +public: 1.77 + /** 1.78 + * Construct a CanonicalIterator object 1.79 + * @param source string to get results for 1.80 + * @param status Fill-in parameter which receives the status of this operation. 1.81 + * @stable ICU 2.4 1.82 + */ 1.83 + CanonicalIterator(const UnicodeString &source, UErrorCode &status); 1.84 + 1.85 + /** Destructor 1.86 + * Cleans pieces 1.87 + * @stable ICU 2.4 1.88 + */ 1.89 + virtual ~CanonicalIterator(); 1.90 + 1.91 + /** 1.92 + * Gets the NFD form of the current source we are iterating over. 1.93 + * @return gets the source: NOTE: it is the NFD form of source 1.94 + * @stable ICU 2.4 1.95 + */ 1.96 + UnicodeString getSource(); 1.97 + 1.98 + /** 1.99 + * Resets the iterator so that one can start again from the beginning. 1.100 + * @stable ICU 2.4 1.101 + */ 1.102 + void reset(); 1.103 + 1.104 + /** 1.105 + * Get the next canonically equivalent string. 1.106 + * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 1.107 + * @return the next string that is canonically equivalent. A bogus string is returned when 1.108 + * the iteration is done. 1.109 + * @stable ICU 2.4 1.110 + */ 1.111 + UnicodeString next(); 1.112 + 1.113 + /** 1.114 + * Set a new source for this iterator. Allows object reuse. 1.115 + * @param newSource the source string to iterate against. This allows the same iterator to be used 1.116 + * while changing the source string, saving object creation. 1.117 + * @param status Fill-in parameter which receives the status of this operation. 1.118 + * @stable ICU 2.4 1.119 + */ 1.120 + void setSource(const UnicodeString &newSource, UErrorCode &status); 1.121 + 1.122 +#ifndef U_HIDE_INTERNAL_API 1.123 + /** 1.124 + * Dumb recursive implementation of permutation. 1.125 + * TODO: optimize 1.126 + * @param source the string to find permutations for 1.127 + * @param skipZeros determine if skip zeros 1.128 + * @param result the results in a set. 1.129 + * @param status Fill-in parameter which receives the status of this operation. 1.130 + * @internal 1.131 + */ 1.132 + static void U_EXPORT2 permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status); 1.133 +#endif /* U_HIDE_INTERNAL_API */ 1.134 + 1.135 + /** 1.136 + * ICU "poor man's RTTI", returns a UClassID for this class. 1.137 + * 1.138 + * @stable ICU 2.2 1.139 + */ 1.140 + static UClassID U_EXPORT2 getStaticClassID(); 1.141 + 1.142 + /** 1.143 + * ICU "poor man's RTTI", returns a UClassID for the actual class. 1.144 + * 1.145 + * @stable ICU 2.2 1.146 + */ 1.147 + virtual UClassID getDynamicClassID() const; 1.148 + 1.149 +private: 1.150 + // ===================== PRIVATES ============================== 1.151 + // private default constructor 1.152 + CanonicalIterator(); 1.153 + 1.154 + 1.155 + /** 1.156 + * Copy constructor. Private for now. 1.157 + * @internal 1.158 + */ 1.159 + CanonicalIterator(const CanonicalIterator& other); 1.160 + 1.161 + /** 1.162 + * Assignment operator. Private for now. 1.163 + * @internal 1.164 + */ 1.165 + CanonicalIterator& operator=(const CanonicalIterator& other); 1.166 + 1.167 + // fields 1.168 + UnicodeString source; 1.169 + UBool done; 1.170 + 1.171 + // 2 dimensional array holds the pieces of the string with 1.172 + // their different canonically equivalent representations 1.173 + UnicodeString **pieces; 1.174 + int32_t pieces_length; 1.175 + int32_t *pieces_lengths; 1.176 + 1.177 + // current is used in iterating to combine pieces 1.178 + int32_t *current; 1.179 + int32_t current_length; 1.180 + 1.181 + // transient fields 1.182 + UnicodeString buffer; 1.183 + 1.184 + const Normalizer2 &nfd; 1.185 + const Normalizer2Impl &nfcImpl; 1.186 + 1.187 + // we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 1.188 + UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status); //private String[] getEquivalents(String segment) 1.189 + 1.190 + //Set getEquivalents2(String segment); 1.191 + Hashtable *getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status); 1.192 + //Hashtable *getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status); 1.193 + 1.194 + /** 1.195 + * See if the decomposition of cp2 is at segment starting at segmentPos 1.196 + * (with canonical rearrangment!) 1.197 + * If so, take the remainder, and return the equivalents 1.198 + */ 1.199 + //Set extract(int comp, String segment, int segmentPos, StringBuffer buffer); 1.200 + Hashtable *extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status); 1.201 + //Hashtable *extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status); 1.202 + 1.203 + void cleanPieces(); 1.204 + 1.205 +}; 1.206 + 1.207 +U_NAMESPACE_END 1.208 + 1.209 +#endif /* #if !UCONFIG_NO_NORMALIZATION */ 1.210 + 1.211 +#endif