intl/icu/source/i18n/identifier_info.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 *
michael@0 7 * identifier_info.h
michael@0 8 *
michael@0 9 * created on: 2013 Jan 7
michael@0 10 * created by: Andy Heninger
michael@0 11 */
michael@0 12
michael@0 13 #ifndef __IDENTIFIER_INFO_H__
michael@0 14 #define __IDENTIFIER_INFO_H__
michael@0 15
michael@0 16 #include "unicode/utypes.h"
michael@0 17
michael@0 18 #include "unicode/uniset.h"
michael@0 19 #include "unicode/uspoof.h"
michael@0 20 #include "uhash.h"
michael@0 21
michael@0 22 U_NAMESPACE_BEGIN
michael@0 23
michael@0 24 class ScriptSet;
michael@0 25
michael@0 26 // TODO(andy): review consistency of reference vs pointer arguments to the functions.
michael@0 27
michael@0 28 /**
michael@0 29 * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
michael@0 30 * then setIdentifier. Available methods include:
michael@0 31 * <ol>
michael@0 32 * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
michael@0 33 * each of these.
michael@0 34 * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
michael@0 35 * either Katakana or Hiragana.
michael@0 36 * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
michael@0 37 * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
michael@0 38 * the identifier.
michael@0 39 * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
michael@0 40 * </ol>
michael@0 41 *
michael@0 42 * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
michael@0 43 */
michael@0 44 class U_I18N_API IdentifierInfo : public UMemory {
michael@0 45
michael@0 46 public:
michael@0 47 /**
michael@0 48 * Create an identifier info object. Subsequently, call setIdentifier(), etc.
michael@0 49 * @internal
michael@0 50 */
michael@0 51 IdentifierInfo(UErrorCode &status);
michael@0 52
michael@0 53 /**
michael@0 54 * Destructor
michael@0 55 */
michael@0 56 virtual ~IdentifierInfo();
michael@0 57
michael@0 58 private:
michael@0 59 /* Disallow copying for now. Can be added if there's a need. */
michael@0 60 IdentifierInfo(const IdentifierInfo &other);
michael@0 61
michael@0 62 public:
michael@0 63
michael@0 64 /**
michael@0 65 * Set the identifier profile: the characters that are to be allowed in the identifier.
michael@0 66 *
michael@0 67 * @param identifierProfile the characters that are to be allowed in the identifier
michael@0 68 * @return this
michael@0 69 * @internal
michael@0 70 */
michael@0 71 IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
michael@0 72
michael@0 73 /**
michael@0 74 * Get the identifier profile: the characters that are to be allowed in the identifier.
michael@0 75 *
michael@0 76 * @return The characters that are to be allowed in the identifier.
michael@0 77 * @internal
michael@0 78 */
michael@0 79 const UnicodeSet &getIdentifierProfile() const;
michael@0 80
michael@0 81
michael@0 82 /**
michael@0 83 * Set an identifier to analyze. Afterwards, call methods like getScripts()
michael@0 84 *
michael@0 85 * @param identifier the identifier to analyze
michael@0 86 * @param status Errorcode, set if errors occur.
michael@0 87 * @return this
michael@0 88 * @internal
michael@0 89 */
michael@0 90 IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
michael@0 91
michael@0 92
michael@0 93 /**
michael@0 94 * Get the identifier that was analyzed. The returned string is owned by the ICU library,
michael@0 95 * and must not be deleted by the caller.
michael@0 96 *
michael@0 97 * @return the identifier that was analyzed.
michael@0 98 * @internal
michael@0 99 */
michael@0 100 const UnicodeString *getIdentifier() const;
michael@0 101
michael@0 102
michael@0 103 /**
michael@0 104 * Get the scripts found in the identifiers.
michael@0 105 *
michael@0 106 * @return the set of explicit scripts.
michael@0 107 * @internal
michael@0 108 */
michael@0 109 const ScriptSet *getScripts() const;
michael@0 110
michael@0 111 /**
michael@0 112 * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
michael@0 113 * the set consisting of those scripts will be returned.
michael@0 114 *
michael@0 115 * @return a uhash, with each key being of type (ScriptSet *).
michael@0 116 * This is a set, not a map, so the value stored in the uhash is not relevant.
michael@0 117 * (It is, in fact, 1).
michael@0 118 * Ownership of the uhash and its contents remains with the IdentifierInfo object,
michael@0 119 * and remains valid until a new identifier is set or until the object is deleted.
michael@0 120 * @internal
michael@0 121 */
michael@0 122 const UHashtable *getAlternates() const;
michael@0 123
michael@0 124 /**
michael@0 125 * Get the representative characters (zeros) for the numerics found in the identifier.
michael@0 126 *
michael@0 127 * @return the set of representative zero characters, one for each decimal number system in the identifier.
michael@0 128 * @internal
michael@0 129 */
michael@0 130 const UnicodeSet *getNumerics() const;
michael@0 131
michael@0 132 /**
michael@0 133 * Find out which scripts are in common among the alternates.
michael@0 134 *
michael@0 135 * @return the set of scripts that are in common among the alternates.
michael@0 136 * @internal
michael@0 137 */
michael@0 138 const ScriptSet *getCommonAmongAlternates() const;
michael@0 139
michael@0 140 /**
michael@0 141 * Get the number of scripts appearing in the identifier.
michael@0 142 * Note: Common and Inherited scripts are omitted from the count.
michael@0 143 * Note: Result may be high when the identifier contains characters
michael@0 144 * with alternate scripts. The distinction between
michael@0 145 * 0, 1 and > 1 will remain valid, however.
michael@0 146 * @return the number of scripts.
michael@0 147 */
michael@0 148 int32_t getScriptCount() const;
michael@0 149
michael@0 150 #if !UCONFIG_NO_NORMALIZATION
michael@0 151
michael@0 152 /**
michael@0 153 * Find the "tightest" restriction level that the identifier satisfies.
michael@0 154 *
michael@0 155 * @return the restriction level.
michael@0 156 * @internal
michael@0 157 */
michael@0 158 URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
michael@0 159
michael@0 160 #endif /*!UCONFIG_NO_NORMALIZATION */
michael@0 161
michael@0 162 UnicodeString toString() const;
michael@0 163
michael@0 164 /**
michael@0 165 * Produce a readable string of alternates.
michael@0 166 *
michael@0 167 * @param alternates a UHashtable of ScriptSets.
michael@0 168 * Keys only, no meaningful values in the UHash.
michael@0 169 * @return display form
michael@0 170 * @internal
michael@0 171 */
michael@0 172 static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
michael@0 173
michael@0 174 /**
michael@0 175 * Static memory cleanup function.
michael@0 176 * @internal
michael@0 177 */
michael@0 178 static UBool cleanup();
michael@0 179 private:
michael@0 180
michael@0 181 IdentifierInfo & clear();
michael@0 182 UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
michael@0 183
michael@0 184 UnicodeString *fIdentifier;
michael@0 185 ScriptSet *fRequiredScripts;
michael@0 186 UHashtable *fScriptSetSet;
michael@0 187 ScriptSet *fCommonAmongAlternates;
michael@0 188 UnicodeSet *fNumerics;
michael@0 189 UnicodeSet *fIdentifierProfile;
michael@0 190
michael@0 191 static UnicodeSet *ASCII;
michael@0 192 static ScriptSet *JAPANESE;
michael@0 193 static ScriptSet *CHINESE;
michael@0 194 static ScriptSet *KOREAN;
michael@0 195 static ScriptSet *CONFUSABLE_WITH_LATIN;
michael@0 196
michael@0 197
michael@0 198
michael@0 199 };
michael@0 200
michael@0 201 U_NAMESPACE_END
michael@0 202
michael@0 203 #endif // __IDENTIFIER_INFO_H__
michael@0 204

mercurial