michael@0: /*
michael@0: **********************************************************************
michael@0: * Copyright (C) 2013, International Business Machines
michael@0: * Corporation and others. All Rights Reserved.
michael@0: **********************************************************************
michael@0: *
michael@0: * identifier_info.h
michael@0: *
michael@0: * created on: 2013 Jan 7
michael@0: * created by: Andy Heninger
michael@0: */
michael@0:
michael@0: #ifndef __IDENTIFIER_INFO_H__
michael@0: #define __IDENTIFIER_INFO_H__
michael@0:
michael@0: #include "unicode/utypes.h"
michael@0:
michael@0: #include "unicode/uniset.h"
michael@0: #include "unicode/uspoof.h"
michael@0: #include "uhash.h"
michael@0:
michael@0: U_NAMESPACE_BEGIN
michael@0:
michael@0: class ScriptSet;
michael@0:
michael@0: // TODO(andy): review consistency of reference vs pointer arguments to the functions.
michael@0:
michael@0: /**
michael@0: * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
michael@0: * then setIdentifier. Available methods include:
michael@0: *
michael@0: * - call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
michael@0: * each of these.
michael@0: *
michael@0: * - call getAlternates to get cases where a character is not limited to a single script. For example, it could be
michael@0: * either Katakana or Hiragana.
michael@0: *
michael@0: * - call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
michael@0: *
michael@0: * - call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
michael@0: * the identifier.
michael@0: *
michael@0: * - call getRestrictionLevel to see what the UTS36 restriction level is.
michael@0: *
michael@0: * Instances cannot be copied or assigned (see the private declarations below).
michael@0: *
michael@0: * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
michael@0: */
michael@0: class U_I18N_API IdentifierInfo : public UMemory {
michael@0:
michael@0: public:
michael@0: /**
michael@0: * Create an identifier info object. Subsequently, call setIdentifier(), etc.
michael@0: *
michael@0: * @param status Errorcode, set if errors occur.
michael@0: * @internal
michael@0: */
michael@0: IdentifierInfo(UErrorCode &status);
michael@0:
michael@0: /**
michael@0: * Destructor
michael@0: */
michael@0: virtual ~IdentifierInfo();
michael@0:
michael@0: private:
michael@0: /*
michael@0: * Disallow copying for now. Can be added if there's a need.
michael@0: *
michael@0: * Both copy construction and copy assignment are declared private. The object
michael@0: * holds raw pointers (getAlternates() documents that it owns its hashtable), so
michael@0: * a compiler-generated member-wise copy would leave two objects referring to
michael@0: * the same storage.
michael@0: */
michael@0: IdentifierInfo(const IdentifierInfo &other);
michael@0: IdentifierInfo &operator=(const IdentifierInfo &other);
michael@0:
michael@0: public:
michael@0:
michael@0: /**
michael@0: * Set the identifier profile: the characters that are to be allowed in the identifier.
michael@0: *
michael@0: * @param identifierProfile the characters that are to be allowed in the identifier
michael@0: * @return this
michael@0: * @internal
michael@0: */
michael@0: IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
michael@0:
michael@0: /**
michael@0: * Get the identifier profile: the characters that are to be allowed in the identifier.
michael@0: *
michael@0: * @return The characters that are to be allowed in the identifier.
michael@0: * @internal
michael@0: */
michael@0: const UnicodeSet &getIdentifierProfile() const;
michael@0:
michael@0:
michael@0: /**
michael@0: * Set an identifier to analyze. Afterwards, call methods like getScripts()
michael@0: *
michael@0: * @param identifier the identifier to analyze
michael@0: * @param status Errorcode, set if errors occur.
michael@0: * @return this
michael@0: * @internal
michael@0: */
michael@0: IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
michael@0:
michael@0:
michael@0: /**
michael@0: * Get the identifier that was analyzed. The returned string is owned by the ICU library,
michael@0: * and must not be deleted by the caller.
michael@0: *
michael@0: * @return the identifier that was analyzed.
michael@0: * @internal
michael@0: */
michael@0: const UnicodeString *getIdentifier() const;
michael@0:
michael@0:
michael@0: /**
michael@0: * Get the scripts found in the identifier.
michael@0: *
michael@0: * @return the set of explicit scripts.
michael@0: * @internal
michael@0: */
michael@0: const ScriptSet *getScripts() const;
michael@0:
michael@0: /**
michael@0: * Get the set of alternate scripts found in the identifier. That is, when a character can be in two scripts, then
michael@0: * the set consisting of those scripts will be returned.
michael@0: *
michael@0: * @return a uhash, with each key being of type (ScriptSet *).
michael@0: * This is a set, not a map, so the value stored in the uhash is not relevant.
michael@0: * (It is, in fact, 1).
michael@0: * Ownership of the uhash and its contents remains with the IdentifierInfo object,
michael@0: * and remains valid until a new identifier is set or until the object is deleted.
michael@0: * @internal
michael@0: */
michael@0: const UHashtable *getAlternates() const;
michael@0:
michael@0: /**
michael@0: * Get the representative characters (zeros) for the numerics found in the identifier.
michael@0: *
michael@0: * @return the set of zero digits, one for each decimal number system found in the identifier.
michael@0: * @internal
michael@0: */
michael@0: const UnicodeSet *getNumerics() const;
michael@0:
michael@0: /**
michael@0: * Find out which scripts are in common among the alternates.
michael@0: *
michael@0: * @return the set of scripts that are in common among the alternates.
michael@0: * @internal
michael@0: */
michael@0: const ScriptSet *getCommonAmongAlternates() const;
michael@0:
michael@0: /**
michael@0: * Get the number of scripts appearing in the identifier.
michael@0: * Note: Common and Inherited scripts are omitted from the count.
michael@0: * Note: Result may be high when the identifier contains characters
michael@0: * with alternate scripts. The distinction between
michael@0: * 0, 1 and > 1 will remain valid, however.
michael@0: * @return the number of scripts.
michael@0: */
michael@0: int32_t getScriptCount() const;
michael@0:
michael@0: #if !UCONFIG_NO_NORMALIZATION
michael@0:
michael@0: /**
michael@0: * Find the "tightest" restriction level that the identifier satisfies.
michael@0: * Only declared when normalization support is configured in.
michael@0: *
michael@0: * @param status Errorcode, set if errors occur.
michael@0: * @return the restriction level.
michael@0: * @internal
michael@0: */
michael@0: URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
michael@0:
michael@0: #endif /*!UCONFIG_NO_NORMALIZATION */
michael@0:
michael@0: /**
michael@0: * Produce a string representation of this object.
michael@0: * The format is defined by the implementation and is not specified here.
michael@0: *
michael@0: * @return the string form, returned by value.
michael@0: */
michael@0: UnicodeString toString() const;
michael@0:
michael@0: /**
michael@0: * Produce a readable string of alternates.
michael@0: *
michael@0: * @param dest the string that receives the display form
michael@0: * (presumably also the reference that is returned; confirm in the implementation).
michael@0: * @param alternates a UHashtable of UScriptSets.
michael@0: * Keys only, no meaningful values in the UHash.
michael@0: * @param status Errorcode, set if errors occur.
michael@0: * @return display form
michael@0: * @internal
michael@0: */
michael@0: static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
michael@0:
michael@0: /**
michael@0: * Static memory cleanup function.
michael@0: * @internal
michael@0: */
michael@0: static UBool cleanup();
michael@0: private:
michael@0:
michael@0: IdentifierInfo & clear();
michael@0: UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
michael@0:
michael@0: // Per-object state. The names suggest these back the corresponding getters above
michael@0: // (e.g. fScriptSetSet for getAlternates()); confirm against the implementation.
michael@0: UnicodeString *fIdentifier;
michael@0: ScriptSet *fRequiredScripts;
michael@0: UHashtable *fScriptSetSet;
michael@0: ScriptSet *fCommonAmongAlternates;
michael@0: UnicodeSet *fNumerics;
michael@0: UnicodeSet *fIdentifierProfile;
michael@0:
michael@0: // Static data shared by all instances; presumably what cleanup() releases.
michael@0: static UnicodeSet *ASCII;
michael@0: static ScriptSet *JAPANESE;
michael@0: static ScriptSet *CHINESE;
michael@0: static ScriptSet *KOREAN;
michael@0: static ScriptSet *CONFUSABLE_WITH_LATIN;
michael@0:
michael@0:
michael@0:
michael@0: };
michael@0:
michael@0: U_NAMESPACE_END
michael@0:
michael@0: #endif // __IDENTIFIER_INFO_H__
michael@0: