1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/identifier_info.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,204 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2013, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* 1.10 +* indentifier_info.h 1.11 +* 1.12 +* created on: 2013 Jan 7 1.13 +* created by: Andy Heninger 1.14 +*/ 1.15 + 1.16 +#ifndef __IDENTIFIER_INFO_H__ 1.17 +#define __IDENTIFIER_INFO_H__ 1.18 + 1.19 +#include "unicode/utypes.h" 1.20 + 1.21 +#include "unicode/uniset.h" 1.22 +#include "unicode/uspoof.h" 1.23 +#include "uhash.h" 1.24 + 1.25 +U_NAMESPACE_BEGIN 1.26 + 1.27 +class ScriptSet; 1.28 + 1.29 +// TODO(andy): review consistency of reference vs pointer arguments to the funcions. 1.30 + 1.31 +/** 1.32 + * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile 1.33 + * then setIdentifier. Available methods include: 1.34 + * <ol> 1.35 + * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in 1.36 + * each of these. 1.37 + * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be 1.38 + * either Katakana or Hiragana. 1.39 + * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates. 1.40 + * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in 1.41 + * the identifier. 1.42 + * <li>call getRestrictionLevel to see what the UTS36 restriction level is. 1.43 + * </ol> 1.44 + * 1.45 + * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo 1.46 + */ 1.47 +class U_I18N_API IdentifierInfo : public UMemory { 1.48 + 1.49 + public: 1.50 + /** 1.51 + * Create an identifier info object. Subsequently, call setIdentifier(), etc. 1.52 + * @internal 1.53 + */ 1.54 + IdentifierInfo(UErrorCode &status); 1.55 + 1.56 + /** 1.57 + * Destructor 1.58 + */ 1.59 + virtual ~IdentifierInfo(); 1.60 + 1.61 + private: 1.62 + /* Disallow copying for now. Can be added if there's a need. */ 1.63 + IdentifierInfo(const IdentifierInfo &other); 1.64 + 1.65 + public: 1.66 + 1.67 + /** 1.68 + * Set the identifier profile: the characters that are to be allowed in the identifier. 1.69 + * 1.70 + * @param identifierProfile the characters that are to be allowed in the identifier 1.71 + * @return this 1.72 + * @internal 1.73 + */ 1.74 + IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile); 1.75 + 1.76 + /** 1.77 + * Get the identifier profile: the characters that are to be allowed in the identifier. 1.78 + * 1.79 + * @return The characters that are to be allowed in the identifier. 1.80 + * @internal 1.81 + */ 1.82 + const UnicodeSet &getIdentifierProfile() const; 1.83 + 1.84 + 1.85 + /** 1.86 + * Set an identifier to analyze. Afterwards, call methods like getScripts() 1.87 + * 1.88 + * @param identifier the identifier to analyze 1.89 + * @param status Errorcode, set if errors occur. 1.90 + * @return this 1.91 + * @internal 1.92 + */ 1.93 + IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status); 1.94 + 1.95 + 1.96 + /** 1.97 + * Get the identifier that was analyzed. The returned string is owned by the ICU library, 1.98 + * and must not be deleted by the caller. 1.99 + * 1.100 + * @return the identifier that was analyzed. 1.101 + * @internal 1.102 + */ 1.103 + const UnicodeString *getIdentifier() const; 1.104 + 1.105 + 1.106 + /** 1.107 + * Get the scripts found in the identifiers. 1.108 + * 1.109 + * @return the set of explicit scripts. 1.110 + * @internal 1.111 + */ 1.112 + const ScriptSet *getScripts() const; 1.113 + 1.114 + /** 1.115 + * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then 1.116 + * the set consisting of those scripts will be returned. 1.117 + * 1.118 + * @return a uhash, with each key being of type (ScriptSet *). 1.119 + * This is a set, not a map, so the value stored in the uhash is not relevant. 1.120 + * (It is, in fact, 1). 1.121 + * Ownership of the uhash and its contents remains with the IndetifierInfo object, 1.122 + * and remains valid until a new identifer is set or until the object is deleted. 1.123 + * @internal 1.124 + */ 1.125 + const UHashtable *getAlternates() const; 1.126 + 1.127 + /** 1.128 + * Get the representative characters (zeros) for the numerics found in the identifier. 1.129 + * 1.130 + * @return the set of explicit scripts. 1.131 + * @internal 1.132 + */ 1.133 + const UnicodeSet *getNumerics() const; 1.134 + 1.135 + /** 1.136 + * Find out which scripts are in common among the alternates. 1.137 + * 1.138 + * @return the set of scripts that are in common among the alternates. 1.139 + * @internal 1.140 + */ 1.141 + const ScriptSet *getCommonAmongAlternates() const; 1.142 + 1.143 + /** 1.144 + * Get the number of scripts appearing in the identifier. 1.145 + * Note: Common and Inherited scripts are omitted from the count. 1.146 + * Note: Result may be high when the identifier contains characters 1.147 + * with alternate scripts. The distinction between 1.148 + * 0, 1 and > 1 will remain valid, however. 1.149 + * @return the number of scripts. 1.150 + */ 1.151 + int32_t getScriptCount() const; 1.152 + 1.153 +#if !UCONFIG_NO_NORMALIZATION 1.154 + 1.155 + /** 1.156 + * Find the "tightest" restriction level that the identifier satisfies. 1.157 + * 1.158 + * @return the restriction level. 1.159 + * @internal 1.160 + */ 1.161 + URestrictionLevel getRestrictionLevel(UErrorCode &status) const; 1.162 + 1.163 +#endif /*!UCONFIG_NO_NORMALIZATION */ 1.164 + 1.165 + UnicodeString toString() const; 1.166 + 1.167 + /** 1.168 + * Produce a readable string of alternates. 1.169 + * 1.170 + * @param alternates a UHashtable of UScriptSets. 1.171 + * Keys only, no meaningful values in the UHash. 1.172 + * @return display form 1.173 + * @internal 1.174 + */ 1.175 + static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status); 1.176 + 1.177 + /** 1.178 + * Static memory cleanup function. 1.179 + * @internal 1.180 + */ 1.181 + static UBool cleanup(); 1.182 + private: 1.183 + 1.184 + IdentifierInfo & clear(); 1.185 + UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; 1.186 + 1.187 + UnicodeString *fIdentifier; 1.188 + ScriptSet *fRequiredScripts; 1.189 + UHashtable *fScriptSetSet; 1.190 + ScriptSet *fCommonAmongAlternates; 1.191 + UnicodeSet *fNumerics; 1.192 + UnicodeSet *fIdentifierProfile; 1.193 + 1.194 + static UnicodeSet *ASCII; 1.195 + static ScriptSet *JAPANESE; 1.196 + static ScriptSet *CHINESE; 1.197 + static ScriptSet *KOREAN; 1.198 + static ScriptSet *CONFUSABLE_WITH_LATIN; 1.199 + 1.200 + 1.201 + 1.202 +}; 1.203 + 1.204 +U_NAMESPACE_END 1.205 + 1.206 +#endif // __IDENTIFIER_INFO_H__ 1.207 +