Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2013, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * |
michael@0 | 7 | * identifier_info.h |
michael@0 | 8 | * |
michael@0 | 9 | * created on: 2013 Jan 7 |
michael@0 | 10 | * created by: Andy Heninger |
michael@0 | 11 | */ |
michael@0 | 12 | |
michael@0 | 13 | #ifndef __IDENTIFIER_INFO_H__ |
michael@0 | 14 | #define __IDENTIFIER_INFO_H__ |
michael@0 | 15 | |
michael@0 | 16 | #include "unicode/utypes.h" |
michael@0 | 17 | |
michael@0 | 18 | #include "unicode/uniset.h" |
michael@0 | 19 | #include "unicode/uspoof.h" |
michael@0 | 20 | #include "uhash.h" |
michael@0 | 21 | |
michael@0 | 22 | U_NAMESPACE_BEGIN |
michael@0 | 23 | |
michael@0 | 24 | class ScriptSet; |
michael@0 | 25 | |
michael@0 | 26 | // TODO(andy): review consistency of reference vs pointer arguments to the functions. |
michael@0 | 27 | |
michael@0 | 28 | /** |
michael@0 | 29 | * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile |
michael@0 | 30 | * then setIdentifier. Available methods include: |
michael@0 | 31 | * <ol> |
michael@0 | 32 | * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in |
michael@0 | 33 | * each of these. |
michael@0 | 34 | * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be |
michael@0 | 35 | * either Katakana or Hiragana. |
michael@0 | 36 | * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates. |
michael@0 | 37 | * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in |
michael@0 | 38 | * the identifier. |
michael@0 | 39 | * <li>call getRestrictionLevel to see what the UTS36 restriction level is. |
michael@0 | 40 | * </ol> |
michael@0 | 41 | * |
michael@0 | 42 | * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo |
michael@0 | 43 | */ |
michael@0 | 44 | class U_I18N_API IdentifierInfo : public UMemory { |
michael@0 | 45 | |
michael@0 | 46 | public: |
michael@0 | 47 | /** |
michael@0 | 48 | * Create an identifier info object. Subsequently, call setIdentifier(), etc. |
michael@0 | 49 | * @internal |
michael@0 | 50 | */ |
michael@0 | 51 | IdentifierInfo(UErrorCode &status); |
michael@0 | 52 | |
michael@0 | 53 | /** |
michael@0 | 54 | * Destructor |
michael@0 | 55 | */ |
michael@0 | 56 | virtual ~IdentifierInfo(); |
michael@0 | 57 | |
michael@0 | 58 | private: |
michael@0 | 59 | /* Disallow copying for now. Can be added if there's a need. NOTE(review): only the copy constructor is declared private here; no operator= is declared in this class, so implicit copy assignment is still available - confirm whether it should be disallowed too. */ |
michael@0 | 60 | IdentifierInfo(const IdentifierInfo &other); |
michael@0 | 61 | |
michael@0 | 62 | public: |
michael@0 | 63 | |
michael@0 | 64 | /** |
michael@0 | 65 | * Set the identifier profile: the characters that are to be allowed in the identifier. |
michael@0 | 66 | * |
michael@0 | 67 | * @param identifierProfile the characters that are to be allowed in the identifier |
michael@0 | 68 | * @return this |
michael@0 | 69 | * @internal |
michael@0 | 70 | */ |
michael@0 | 71 | IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile); |
michael@0 | 72 | |
michael@0 | 73 | /** |
michael@0 | 74 | * Get the identifier profile: the characters that are to be allowed in the identifier. |
michael@0 | 75 | * |
michael@0 | 76 | * @return The characters that are to be allowed in the identifier. |
michael@0 | 77 | * @internal |
michael@0 | 78 | */ |
michael@0 | 79 | const UnicodeSet &getIdentifierProfile() const; |
michael@0 | 80 | |
michael@0 | 81 | |
michael@0 | 82 | /** |
michael@0 | 83 | * Set an identifier to analyze. Afterwards, call methods like getScripts() |
michael@0 | 84 | * |
michael@0 | 85 | * @param identifier the identifier to analyze |
michael@0 | 86 | * @param status Errorcode, set if errors occur. |
michael@0 | 87 | * @return this |
michael@0 | 88 | * @internal |
michael@0 | 89 | */ |
michael@0 | 90 | IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status); |
michael@0 | 91 | |
michael@0 | 92 | |
michael@0 | 93 | /** |
michael@0 | 94 | * Get the identifier that was analyzed. The returned string is owned by the ICU library, |
michael@0 | 95 | * and must not be deleted by the caller. |
michael@0 | 96 | * |
michael@0 | 97 | * @return the identifier that was analyzed. |
michael@0 | 98 | * @internal |
michael@0 | 99 | */ |
michael@0 | 100 | const UnicodeString *getIdentifier() const; |
michael@0 | 101 | |
michael@0 | 102 | |
michael@0 | 103 | /** |
michael@0 | 104 | * Get the scripts found in the identifiers. |
michael@0 | 105 | * |
michael@0 | 106 | * @return the set of explicit scripts. |
michael@0 | 107 | * @internal |
michael@0 | 108 | */ |
michael@0 | 109 | const ScriptSet *getScripts() const; |
michael@0 | 110 | |
michael@0 | 111 | /** |
michael@0 | 112 | * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then |
michael@0 | 113 | * the set consisting of those scripts will be returned. |
michael@0 | 114 | * |
michael@0 | 115 | * @return a uhash, with each key being of type (ScriptSet *). |
michael@0 | 116 | * This is a set, not a map, so the value stored in the uhash is not relevant. |
michael@0 | 117 | * (It is, in fact, 1). |
michael@0 | 118 | * Ownership of the uhash and its contents remains with the IdentifierInfo object, |
michael@0 | 119 | * and remains valid until a new identifier is set or until the object is deleted. |
michael@0 | 120 | * @internal |
michael@0 | 121 | */ |
michael@0 | 122 | const UHashtable *getAlternates() const; |
michael@0 | 123 | |
michael@0 | 124 | /** |
michael@0 | 125 | * Get the representative characters (zeros) for the numerics found in the identifier. |
michael@0 | 126 | * |
michael@0 | 127 | * @return the set of representative zero characters, one for each decimal number system found in the identifier. |
michael@0 | 128 | * @internal |
michael@0 | 129 | */ |
michael@0 | 130 | const UnicodeSet *getNumerics() const; |
michael@0 | 131 | |
michael@0 | 132 | /** |
michael@0 | 133 | * Find out which scripts are in common among the alternates. |
michael@0 | 134 | * |
michael@0 | 135 | * @return the set of scripts that are in common among the alternates. |
michael@0 | 136 | * @internal |
michael@0 | 137 | */ |
michael@0 | 138 | const ScriptSet *getCommonAmongAlternates() const; |
michael@0 | 139 | |
michael@0 | 140 | /** |
michael@0 | 141 | * Get the number of scripts appearing in the identifier. |
michael@0 | 142 | * Note: Common and Inherited scripts are omitted from the count. |
michael@0 | 143 | * Note: Result may be high when the identifier contains characters |
michael@0 | 144 | * with alternate scripts. The distinction between |
michael@0 | 145 | * 0, 1 and > 1 will remain valid, however. |
michael@0 | 146 | * @return the number of scripts. |
michael@0 | 147 | */ |
michael@0 | 148 | int32_t getScriptCount() const; |
michael@0 | 149 | |
michael@0 | 150 | #if !UCONFIG_NO_NORMALIZATION |
michael@0 | 151 | |
michael@0 | 152 | /** |
michael@0 | 153 | * Find the "tightest" restriction level that the identifier satisfies. |
michael@0 | 154 | * @param status ICU error code (UErrorCode), passed by reference. |
michael@0 | 155 | * @return the restriction level. |
michael@0 | 156 | * @internal |
michael@0 | 157 | */ |
michael@0 | 158 | URestrictionLevel getRestrictionLevel(UErrorCode &status) const; |
michael@0 | 159 | |
michael@0 | 160 | #endif /*!UCONFIG_NO_NORMALIZATION */ |
michael@0 | 161 | |
michael@0 | 162 | UnicodeString toString() const; /* NOTE(review): undocumented here; presumably a human-readable summary of the analysis results - confirm in the implementation. */ |
michael@0 | 163 | |
michael@0 | 164 | /** |
michael@0 | 165 | * Produce a readable string of alternates. |
michael@0 | 166 | * @param dest a UnicodeString passed by reference; presumably receives the display form (confirm in the implementation). |
michael@0 | 167 | * @param alternates a UHashtable of ScriptSets. |
michael@0 | 168 | * Keys only, no meaningful values in the UHash. |
michael@0 | 169 | * @return display form |
michael@0 | 170 | * @internal |
michael@0 | 171 | */ |
michael@0 | 172 | static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status); |
michael@0 | 173 | |
michael@0 | 174 | /** |
michael@0 | 175 | * Static memory cleanup function. |
michael@0 | 176 | * @internal |
michael@0 | 177 | */ |
michael@0 | 178 | static UBool cleanup(); |
michael@0 | 179 | private: |
michael@0 | 180 | |
michael@0 | 181 | IdentifierInfo & clear(); /* NOTE(review): private helper; presumably resets the per-identifier analysis state - confirm in the implementation. */ |
michael@0 | 182 | UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; /* NOTE(review): private helper; exact containment rule is defined in the implementation file - not visible here. */ |
michael@0 | 183 | /* Per-object analysis state. NOTE(review): presumably these back the getters above (getIdentifier, getScripts, getAlternates, getCommonAmongAlternates, getNumerics, getIdentifierProfile) - confirm in the implementation. */ |
michael@0 | 184 | UnicodeString *fIdentifier; |
michael@0 | 185 | ScriptSet *fRequiredScripts; |
michael@0 | 186 | UHashtable *fScriptSetSet; |
michael@0 | 187 | ScriptSet *fCommonAmongAlternates; |
michael@0 | 188 | UnicodeSet *fNumerics; |
michael@0 | 189 | UnicodeSet *fIdentifierProfile; |
michael@0 | 190 | /* Class-wide static pointers shared by all instances. NOTE(review): presumably released by cleanup() - confirm in the implementation. */ |
michael@0 | 191 | static UnicodeSet *ASCII; |
michael@0 | 192 | static ScriptSet *JAPANESE; |
michael@0 | 193 | static ScriptSet *CHINESE; |
michael@0 | 194 | static ScriptSet *KOREAN; |
michael@0 | 195 | static ScriptSet *CONFUSABLE_WITH_LATIN; |
michael@0 | 196 | |
michael@0 | 197 | |
michael@0 | 198 | |
michael@0 | 199 | }; |
michael@0 | 200 | |
michael@0 | 201 | U_NAMESPACE_END |
michael@0 | 202 | |
michael@0 | 203 | #endif // __IDENTIFIER_INFO_H__ |
michael@0 | 204 |