michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * michael@0: * indentifier_info.h michael@0: * michael@0: * created on: 2013 Jan 7 michael@0: * created by: Andy Heninger michael@0: */ michael@0: michael@0: #ifndef __IDENTIFIER_INFO_H__ michael@0: #define __IDENTIFIER_INFO_H__ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/uspoof.h" michael@0: #include "uhash.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: class ScriptSet; michael@0: michael@0: // TODO(andy): review consistency of reference vs pointer arguments to the funcions. michael@0: michael@0: /** michael@0: * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile michael@0: * then setIdentifier. Available methods include: michael@0: *
    michael@0: *
  1. call getScripts for the specific scripts in the identifier. The identifier contains at least one character in michael@0: * each of these. michael@0: *
  2. call getAlternates to get cases where a character is not limited to a single script. For example, it could be michael@0: * either Katakana or Hiragana. michael@0: *
  3. call getCommonAmongAlternates to find out if any scripts are common to all the alternates. michael@0: *
  4. call getNumerics to get a representative character (with value zero) for each of the decimal number systems in michael@0: * the identifier. michael@0: *
  5. call getRestrictionLevel to see what the UTS36 restriction level is. michael@0: *
michael@0: * michael@0: * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo michael@0: */ michael@0: class U_I18N_API IdentifierInfo : public UMemory { michael@0: michael@0: public: michael@0: /** michael@0: * Create an identifier info object. Subsequently, call setIdentifier(), etc. michael@0: * @internal michael@0: */ michael@0: IdentifierInfo(UErrorCode &status); michael@0: michael@0: /** michael@0: * Destructor michael@0: */ michael@0: virtual ~IdentifierInfo(); michael@0: michael@0: private: michael@0: /* Disallow copying for now. Can be added if there's a need. */ michael@0: IdentifierInfo(const IdentifierInfo &other); michael@0: michael@0: public: michael@0: michael@0: /** michael@0: * Set the identifier profile: the characters that are to be allowed in the identifier. michael@0: * michael@0: * @param identifierProfile the characters that are to be allowed in the identifier michael@0: * @return this michael@0: * @internal michael@0: */ michael@0: IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile); michael@0: michael@0: /** michael@0: * Get the identifier profile: the characters that are to be allowed in the identifier. michael@0: * michael@0: * @return The characters that are to be allowed in the identifier. michael@0: * @internal michael@0: */ michael@0: const UnicodeSet &getIdentifierProfile() const; michael@0: michael@0: michael@0: /** michael@0: * Set an identifier to analyze. Afterwards, call methods like getScripts() michael@0: * michael@0: * @param identifier the identifier to analyze michael@0: * @param status Errorcode, set if errors occur. michael@0: * @return this michael@0: * @internal michael@0: */ michael@0: IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status); michael@0: michael@0: michael@0: /** michael@0: * Get the identifier that was analyzed. The returned string is owned by the ICU library, michael@0: * and must not be deleted by the caller. michael@0: * michael@0: * @return the identifier that was analyzed. michael@0: * @internal michael@0: */ michael@0: const UnicodeString *getIdentifier() const; michael@0: michael@0: michael@0: /** michael@0: * Get the scripts found in the identifiers. michael@0: * michael@0: * @return the set of explicit scripts. michael@0: * @internal michael@0: */ michael@0: const ScriptSet *getScripts() const; michael@0: michael@0: /** michael@0: * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then michael@0: * the set consisting of those scripts will be returned. michael@0: * michael@0: * @return a uhash, with each key being of type (ScriptSet *). michael@0: * This is a set, not a map, so the value stored in the uhash is not relevant. michael@0: * (It is, in fact, 1). michael@0: * Ownership of the uhash and its contents remains with the IndetifierInfo object, michael@0: * and remains valid until a new identifer is set or until the object is deleted. michael@0: * @internal michael@0: */ michael@0: const UHashtable *getAlternates() const; michael@0: michael@0: /** michael@0: * Get the representative characters (zeros) for the numerics found in the identifier. michael@0: * michael@0: * @return the set of explicit scripts. michael@0: * @internal michael@0: */ michael@0: const UnicodeSet *getNumerics() const; michael@0: michael@0: /** michael@0: * Find out which scripts are in common among the alternates. michael@0: * michael@0: * @return the set of scripts that are in common among the alternates. michael@0: * @internal michael@0: */ michael@0: const ScriptSet *getCommonAmongAlternates() const; michael@0: michael@0: /** michael@0: * Get the number of scripts appearing in the identifier. michael@0: * Note: Common and Inherited scripts are omitted from the count. michael@0: * Note: Result may be high when the identifier contains characters michael@0: * with alternate scripts. The distinction between michael@0: * 0, 1 and > 1 will remain valid, however. michael@0: * @return the number of scripts. michael@0: */ michael@0: int32_t getScriptCount() const; michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: /** michael@0: * Find the "tightest" restriction level that the identifier satisfies. michael@0: * michael@0: * @return the restriction level. michael@0: * @internal michael@0: */ michael@0: URestrictionLevel getRestrictionLevel(UErrorCode &status) const; michael@0: michael@0: #endif /*!UCONFIG_NO_NORMALIZATION */ michael@0: michael@0: UnicodeString toString() const; michael@0: michael@0: /** michael@0: * Produce a readable string of alternates. michael@0: * michael@0: * @param alternates a UHashtable of UScriptSets. michael@0: * Keys only, no meaningful values in the UHash. michael@0: * @return display form michael@0: * @internal michael@0: */ michael@0: static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status); michael@0: michael@0: /** michael@0: * Static memory cleanup function. michael@0: * @internal michael@0: */ michael@0: static UBool cleanup(); michael@0: private: michael@0: michael@0: IdentifierInfo & clear(); michael@0: UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; michael@0: michael@0: UnicodeString *fIdentifier; michael@0: ScriptSet *fRequiredScripts; michael@0: UHashtable *fScriptSetSet; michael@0: ScriptSet *fCommonAmongAlternates; michael@0: UnicodeSet *fNumerics; michael@0: UnicodeSet *fIdentifierProfile; michael@0: michael@0: static UnicodeSet *ASCII; michael@0: static ScriptSet *JAPANESE; michael@0: static ScriptSet *CHINESE; michael@0: static ScriptSet *KOREAN; michael@0: static ScriptSet *CONFUSABLE_WITH_LATIN; michael@0: michael@0: michael@0: michael@0: }; michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif // __IDENTIFIER_INFO_H__ michael@0: