diff -r 000000000000 -r 6474c204b198 intl/icu/source/i18n/identifier_info.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/intl/icu/source/i18n/identifier_info.h Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,204 @@
+/*
+**********************************************************************
+* Copyright (C) 2013, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+*
+* identifier_info.h
+*
+* created on: 2013 Jan 7
+* created by: Andy Heninger
+*/
+
+#ifndef __IDENTIFIER_INFO_H__
+#define __IDENTIFIER_INFO_H__
+
+#include "unicode/utypes.h"
+
+#include "unicode/uniset.h"
+#include "unicode/uspoof.h"
+#include "uhash.h"
+
+U_NAMESPACE_BEGIN
+
+class ScriptSet;
+
+// TODO(andy): review consistency of reference vs pointer arguments to the functions.
+
+/**
+ * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
+ * then setIdentifier. Available methods include:
+ *
+ * - call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
+ * each of these.
+ *
+ * - call getAlternates to get cases where a character is not limited to a single script. For example, it could be
+ * either Katakana or Hiragana.
+ *
+ * - call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
+ *
+ * - call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
+ * the identifier.
+ *
+ * - call getRestrictionLevel to see what the UTS36 restriction level is.
+ *
+ *
+ * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
+ */
+class U_I18N_API IdentifierInfo : public UMemory {
+
+ public:
+ /**
+ * Create an identifier info object. Subsequently, call setIdentifier(), etc.
+ *
+ * @param status Errorcode, set if errors occur.
+ * @internal
+ */
+ IdentifierInfo(UErrorCode &status);
+
+ /**
+ * Destructor
+ */
+ virtual ~IdentifierInfo();
+
+ private:
+ /*
+ * Disallow copying and assignment for now. Can be added if there's a need.
+ *
+ * Both operations are declared private so that client code cannot use them.
+ * The object keeps its state behind raw pointers (see the data members below)
+ * whose ownership stays with the object, so the compiler-generated memberwise
+ * assignment must not be available any more than the copy constructor is.
+ */
+ IdentifierInfo(const IdentifierInfo &other);
+ IdentifierInfo &operator=(const IdentifierInfo &other);
+
+ public:
+
+ /**
+ * Set the identifier profile: the characters that are to be allowed in the identifier.
+ *
+ * @param identifierProfile the characters that are to be allowed in the identifier
+ * @return this
+ * @internal
+ */
+ IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
+
+ /**
+ * Get the identifier profile: the characters that are to be allowed in the identifier.
+ *
+ * @return The characters that are to be allowed in the identifier.
+ * @internal
+ */
+ const UnicodeSet &getIdentifierProfile() const;
+
+
+ /**
+ * Set an identifier to analyze. Afterwards, call methods like getScripts()
+ *
+ * @param identifier the identifier to analyze
+ * @param status Errorcode, set if errors occur.
+ * @return this
+ * @internal
+ */
+ IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
+
+
+ /**
+ * Get the identifier that was analyzed. The returned string is owned by the ICU library,
+ * and must not be deleted by the caller.
+ *
+ * @return the identifier that was analyzed.
+ * @internal
+ */
+ const UnicodeString *getIdentifier() const;
+
+
+ /**
+ * Get the scripts found in the identifiers.
+ *
+ * @return the set of explicit scripts.
+ * @internal
+ */
+ const ScriptSet *getScripts() const;
+
+ /**
+ * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
+ * the set consisting of those scripts will be returned.
+ *
+ * @return a uhash, with each key being of type (ScriptSet *).
+ * This is a set, not a map, so the value stored in the uhash is not relevant.
+ * (It is, in fact, 1).
+ * Ownership of the uhash and its contents remains with the IdentifierInfo object,
+ * and remains valid until a new identifier is set or until the object is deleted.
+ * @internal
+ */
+ const UHashtable *getAlternates() const;
+
+ /**
+ * Get the representative characters (zeros) for the numerics found in the identifier.
+ *
+ * @return the set of representative zero characters, one for each decimal
+ * number system found in the identifier.
+ * @internal
+ */
+ const UnicodeSet *getNumerics() const;
+
+ /**
+ * Find out which scripts are in common among the alternates.
+ *
+ * @return the set of scripts that are in common among the alternates.
+ * @internal
+ */
+ const ScriptSet *getCommonAmongAlternates() const;
+
+ /**
+ * Get the number of scripts appearing in the identifier.
+ * Note: Common and Inherited scripts are omitted from the count.
+ * Note: Result may be high when the identifier contains characters
+ * with alternate scripts. The distinction between
+ * 0, 1 and > 1 will remain valid, however.
+ * @return the number of scripts.
+ * @internal
+ */
+ int32_t getScriptCount() const;
+
+#if !UCONFIG_NO_NORMALIZATION
+
+ /**
+ * Find the "tightest" restriction level that the identifier satisfies.
+ * Only available when ICU is built with normalization support.
+ *
+ * @param status Errorcode, set if errors occur.
+ * @return the restriction level.
+ * @internal
+ */
+ URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
+
+#endif /*!UCONFIG_NO_NORMALIZATION */
+
+ /**
+ * Produce a string form of this object.
+ * NOTE(review): the exact format is determined by the implementation file,
+ * which is not visible from this header.
+ *
+ * @return the string form, returned by value.
+ * @internal
+ */
+ UnicodeString toString() const;
+
+ /**
+ * Produce a readable string of alternates.
+ *
+ * @param dest the string to be filled in with the display form.
+ * NOTE(review): presumably this is also the reference that is
+ * returned; confirm against the implementation.
+ * @param alternates a UHashtable of UScriptSets.
+ * Keys only, no meaningful values in the UHash.
+ * @param status Errorcode, set if errors occur.
+ * @return display form
+ * @internal
+ */
+ static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
+
+ /**
+ * Static memory cleanup function.
+ * @internal
+ */
+ static UBool cleanup();
+ private:
+
+ /* Internal helpers. NOTE(review): behavior is defined in the implementation file; */
+ /* from the names, clear() resets the per-identifier state and */
+ /* containsWithAlternates() is a containment test that takes the alternates */
+ /* into account. Confirm before relying on this description. */
+ IdentifierInfo & clear();
+ UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
+
+ /* Per-object state, held through raw pointers. The accessor documentation above */
+ /* states that the identifier string and the alternates hash remain owned by this */
+ /* object; NOTE(review): the remaining members presumably follow the same rule. */
+ UnicodeString *fIdentifier;
+ ScriptSet *fRequiredScripts;
+ UHashtable *fScriptSetSet;
+ ScriptSet *fCommonAmongAlternates;
+ UnicodeSet *fNumerics;
+ UnicodeSet *fIdentifierProfile;
+
+ /* Data shared by all IdentifierInfo objects. */
+ /* NOTE(review): presumably created lazily and released by cleanup(); confirm. */
+ static UnicodeSet *ASCII;
+ static ScriptSet *JAPANESE;
+ static ScriptSet *CHINESE;
+ static ScriptSet *KOREAN;
+ static ScriptSet *CONFUSABLE_WITH_LATIN;
+
+};
+
+U_NAMESPACE_END
+
+#endif // __IDENTIFIER_INFO_H__
+