|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2013, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * |
|
7 * identifier_info.h |
|
8 * |
|
9 * created on: 2013 Jan 7 |
|
10 * created by: Andy Heninger |
|
11 */ |
|
12 |
|
13 #ifndef __IDENTIFIER_INFO_H__ |
|
14 #define __IDENTIFIER_INFO_H__ |
|
15 |
|
16 #include "unicode/utypes.h" |
|
17 |
|
18 #include "unicode/uniset.h" |
|
19 #include "unicode/uspoof.h" |
|
20 #include "uhash.h" |
|
21 |
|
22 U_NAMESPACE_BEGIN |
|
23 |
|
24 class ScriptSet; |
|
25 |
|
26 // TODO(andy): review consistency of reference vs pointer arguments to the functions. |
|
27 |
|
28 /** |
|
29 * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile |
|
30 * then setIdentifier. Available methods include: |
|
31 * <ol> |
|
32 * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in |
|
33 * each of these. |
|
34 * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be |
|
35 * either Katakana or Hiragana. |
|
36 * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates. |
|
37 * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in |
|
38 * the identifier. |
|
39 * <li>call getRestrictionLevel to see what the UTS36 restriction level is. |
|
40 * </ol> |
|
41 * |
|
42 * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo |
|
43 */ |
|
44 class U_I18N_API IdentifierInfo : public UMemory { |
|
45 |
|
46 public: |
|
47 /** |
|
48 * Create an identifier info object. Subsequently, call setIdentifier(), etc. |
|
49 * @internal |
|
50 */ |
|
51 IdentifierInfo(UErrorCode &status); |
|
52 |
|
53 /** |
|
54 * Destructor |
|
55 */ |
|
56 virtual ~IdentifierInfo(); |
|
57 |
|
58 private: |
|
59 /* Disallow copying for now. Can be added if there's a need. */ |
|
60 IdentifierInfo(const IdentifierInfo &other); |
|
61 |
|
62 public: |
|
63 |
|
64 /** |
|
65 * Set the identifier profile: the characters that are to be allowed in the identifier. |
|
66 * |
|
67 * @param identifierProfile the characters that are to be allowed in the identifier |
|
68 * @return this |
|
69 * @internal |
|
70 */ |
|
71 IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile); |
|
72 |
|
73 /** |
|
74 * Get the identifier profile: the characters that are to be allowed in the identifier. |
|
75 * |
|
76 * @return The characters that are to be allowed in the identifier. |
|
77 * @internal |
|
78 */ |
|
79 const UnicodeSet &getIdentifierProfile() const; |
|
80 |
|
81 |
|
82 /** |
|
83 * Set an identifier to analyze. Afterwards, call methods like getScripts() |
|
84 * |
|
85 * @param identifier the identifier to analyze |
|
86 * @param status Errorcode, set if errors occur. |
|
87 * @return this |
|
88 * @internal |
|
89 */ |
|
90 IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status); |
|
91 |
|
92 |
|
93 /** |
|
94 * Get the identifier that was analyzed. The returned string is owned by the ICU library, |
|
95 * and must not be deleted by the caller. |
|
96 * |
|
97 * @return the identifier that was analyzed. |
|
98 * @internal |
|
99 */ |
|
100 const UnicodeString *getIdentifier() const; |
|
101 |
|
102 |
|
103 /** |
|
104 * Get the scripts found in the identifiers. |
|
105 * |
|
106 * @return the set of explicit scripts. |
|
107 * @internal |
|
108 */ |
|
109 const ScriptSet *getScripts() const; |
|
110 |
|
111 /** |
|
112 * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then |
|
113 * the set consisting of those scripts will be returned. |
|
114 * |
|
115 * @return a uhash, with each key being of type (ScriptSet *). |
|
116 * This is a set, not a map, so the value stored in the uhash is not relevant. |
|
117 * (It is, in fact, 1). |
|
118 * Ownership of the uhash and its contents remains with the IndetifierInfo object, |
|
119 * and remains valid until a new identifer is set or until the object is deleted. |
|
120 * @internal |
|
121 */ |
|
122 const UHashtable *getAlternates() const; |
|
123 |
|
124 /** |
|
125 * Get the representative characters (zeros) for the numerics found in the identifier. |
|
126 * |
|
127 * @return the set of explicit scripts. |
|
128 * @internal |
|
129 */ |
|
130 const UnicodeSet *getNumerics() const; |
|
131 |
|
132 /** |
|
133 * Find out which scripts are in common among the alternates. |
|
134 * |
|
135 * @return the set of scripts that are in common among the alternates. |
|
136 * @internal |
|
137 */ |
|
138 const ScriptSet *getCommonAmongAlternates() const; |
|
139 |
|
140 /** |
|
141 * Get the number of scripts appearing in the identifier. |
|
142 * Note: Common and Inherited scripts are omitted from the count. |
|
143 * Note: Result may be high when the identifier contains characters |
|
144 * with alternate scripts. The distinction between |
|
145 * 0, 1 and > 1 will remain valid, however. |
|
146 * @return the number of scripts. |
|
147 */ |
|
148 int32_t getScriptCount() const; |
|
149 |
|
150 #if !UCONFIG_NO_NORMALIZATION |
|
151 |
|
152 /** |
|
153 * Find the "tightest" restriction level that the identifier satisfies. |
|
154 * |
|
155 * @return the restriction level. |
|
156 * @internal |
|
157 */ |
|
158 URestrictionLevel getRestrictionLevel(UErrorCode &status) const; |
|
159 |
|
160 #endif /*!UCONFIG_NO_NORMALIZATION */ |
|
161 |
|
162 UnicodeString toString() const; |
|
163 |
|
164 /** |
|
165 * Produce a readable string of alternates. |
|
166 * |
|
167 * @param alternates a UHashtable of UScriptSets. |
|
168 * Keys only, no meaningful values in the UHash. |
|
169 * @return display form |
|
170 * @internal |
|
171 */ |
|
172 static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status); |
|
173 |
|
174 /** |
|
175 * Static memory cleanup function. |
|
176 * @internal |
|
177 */ |
|
178 static UBool cleanup(); |
|
179 private: |
|
180 |
|
181 IdentifierInfo & clear(); |
|
182 UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; |
|
183 |
|
184 UnicodeString *fIdentifier; |
|
185 ScriptSet *fRequiredScripts; |
|
186 UHashtable *fScriptSetSet; |
|
187 ScriptSet *fCommonAmongAlternates; |
|
188 UnicodeSet *fNumerics; |
|
189 UnicodeSet *fIdentifierProfile; |
|
190 |
|
191 static UnicodeSet *ASCII; |
|
192 static ScriptSet *JAPANESE; |
|
193 static ScriptSet *CHINESE; |
|
194 static ScriptSet *KOREAN; |
|
195 static ScriptSet *CONFUSABLE_WITH_LATIN; |
|
196 |
|
197 |
|
198 |
|
199 }; |
|
200 |
|
201 U_NAMESPACE_END |
|
202 |
|
203 #endif // __IDENTIFIER_INFO_H__ |
|
204 |