intl/icu/source/i18n/identifier_info.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/identifier_info.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,204 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2013, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*
    1.10 +* identifier_info.h
    1.11 +* 
    1.12 +* created on: 2013 Jan 7
    1.13 +* created by: Andy Heninger
    1.14 +*/
    1.15 +
    1.16 +#ifndef __IDENTIFIER_INFO_H__
    1.17 +#define __IDENTIFIER_INFO_H__
    1.18 +
    1.19 +#include "unicode/utypes.h"
    1.20 +
    1.21 +#include "unicode/uniset.h"
    1.22 +#include "unicode/uspoof.h"
    1.23 +#include "uhash.h"
    1.24 +
    1.25 +U_NAMESPACE_BEGIN
    1.26 +
    1.27 +class ScriptSet;
    1.28 +
    1.29 +// TODO(andy): review consistency of reference vs pointer arguments to the functions.
    1.30 +
    1.31 +/**
    1.32 + * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
    1.33 + * then setIdentifier. Available methods include:
    1.34 + * <ol>
    1.35 + * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
    1.36 + * each of these.
    1.37 + * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
    1.38 + * either Katakana or Hiragana.
    1.39 + * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
    1.40 + * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
    1.41 + * the identifier.
    1.42 + * <li>call getRestrictionLevel to see what the UTS 39 restriction level is.
    1.43 + * </ol>
    1.44 + * 
    1.45 + * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
    1.46 + */
    1.47 +class U_I18N_API IdentifierInfo : public UMemory {
    1.48 +
    1.49 +  public:
    1.50 +    /**
    1.51 +     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
    1.52 +     * @internal
    1.53 +     */
    1.54 +    IdentifierInfo(UErrorCode &status);
    1.55 +
    1.56 +    /**
    1.57 +      * Destructor
    1.58 +      */
    1.59 +    virtual ~IdentifierInfo();
    1.60 +
    1.61 +  private:
    1.62 +    /* Disallow copying for now. Can be added if there's a need. NOTE(review): only the copy constructor is declared here; no operator= is declared in this class - confirm assignment is blocked too. */
    1.63 +    IdentifierInfo(const IdentifierInfo &other);
    1.64 +
    1.65 +  public:
    1.66 +     
    1.67 +    /**
    1.68 +     * Set the identifier profile: the characters that are to be allowed in the identifier.
    1.69 +     * 
    1.70 +     * @param identifierProfile the characters that are to be allowed in the identifier
    1.71 +     * @return this
    1.72 +     * @internal
    1.73 +     */
    1.74 +    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
    1.75 +
    1.76 +    /**
    1.77 +     * Get the identifier profile: the characters that are to be allowed in the identifier.
    1.78 +     * 
    1.79 +     * @return The characters that are to be allowed in the identifier.
    1.80 +     * @internal
    1.81 +     */
    1.82 +    const UnicodeSet &getIdentifierProfile() const;
    1.83 +
    1.84 +
    1.85 +    /**
    1.86 +     * Set an identifier to analyze. Afterwards, call methods like getScripts()
    1.87 +     * 
    1.88 +     * @param identifier the identifier to analyze
    1.89 +     * @param status Errorcode, set if errors occur.
    1.90 +     * @return this
    1.91 +     * @internal
    1.92 +     */
    1.93 +    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
    1.94 +
    1.95 +
    1.96 +    /**
    1.97 +     * Get the identifier that was analyzed. The returned string is owned by the ICU library,
    1.98 +     * and must not be deleted by the caller.
    1.99 +     * 
   1.100 +     * @return the identifier that was analyzed.
   1.101 +     * @internal
   1.102 +     */
   1.103 +    const UnicodeString *getIdentifier() const;
   1.104 +    
   1.105 +
   1.106 +    /**
   1.107 +     * Get the scripts found in the identifier.
   1.108 +     * 
   1.109 +     * @return the set of explicit scripts.
   1.110 +     * @internal
   1.111 +     */
   1.112 +    const ScriptSet *getScripts() const;
   1.113 +
   1.114 +    /**
   1.115 +     * Get the set of alternate scripts found in the identifier. That is, when a character can be in two scripts, then
   1.116 +     * the set consisting of those scripts will be returned.
   1.117 +     * 
   1.118 +     * @return a uhash, with each key being of type (ScriptSet *). 
   1.119 +     *         This is a set, not a map, so the value stored in the uhash is not relevant.
   1.120 +     *         (It is, in fact, 1).
   1.121 +     *         Ownership of the uhash and its contents remains with the IdentifierInfo object, 
   1.122 +     *         and remains valid until a new identifier is set or until the object is deleted.
   1.123 +     * @internal
   1.124 +     */
   1.125 +    const UHashtable *getAlternates() const;
   1.126 +
   1.127 +    /**
   1.128 +     * Get the representative characters (zeros) for the numerics found in the identifier.
   1.129 +     * 
   1.130 +     * @return the set of representative zero characters, one for each decimal number system found.
   1.131 +     * @internal
   1.132 +     */
   1.133 +    const UnicodeSet *getNumerics() const;
   1.134 +
   1.135 +    /**
   1.136 +     * Find out which scripts are in common among the alternates.
   1.137 +     * 
   1.138 +     * @return the set of scripts that are in common among the alternates.
   1.139 +     * @internal
   1.140 +     */
   1.141 +    const ScriptSet *getCommonAmongAlternates() const;
   1.142 +
   1.143 +    /**
   1.144 +      * Get the number of scripts appearing in the identifier.
   1.145 +      *   Note: Common and Inherited scripts are omitted from the count.
   1.146 +      *   Note: Result may be high when the identifier contains characters
   1.147 +      *         with alternate scripts. The distinction between
   1.148 +      *         0, 1 and > 1 will remain valid, however.
   1.149 +      * @return the number of scripts.
   1.150 +      */
   1.151 +    int32_t getScriptCount() const;
   1.152 +
   1.153 +#if !UCONFIG_NO_NORMALIZATION
   1.154 +
   1.155 +    /**
   1.156 +     * Find the "tightest" restriction level that the identifier satisfies.
   1.157 +     * 
   1.158 +     * @return the restriction level.
   1.159 +     * @internal
   1.160 +     */
   1.161 +    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
   1.162 +
   1.163 +#endif /*!UCONFIG_NO_NORMALIZATION */
   1.164 +
   1.165 +    UnicodeString toString() const;
   1.166 +
   1.167 +    /**
   1.168 +     * Produce a readable string of alternates.
   1.169 +     * 
   1.170 +     * @param alternates a UHashtable of ScriptSets.
   1.171 +     *        Keys only, no meaningful values in the UHash.
   1.172 +     * @return display form
   1.173 +     * @internal
   1.174 +     */
   1.175 +    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
   1.176 +
   1.177 +    /**
   1.178 +     * Static memory cleanup function.
   1.179 +     * @internal
   1.180 +     */
   1.181 +    static UBool      cleanup();
   1.182 +  private:
   1.183 +
   1.184 +    IdentifierInfo  & clear();
   1.185 +    UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
   1.186 +
   1.187 +    UnicodeString     *fIdentifier;
   1.188 +    ScriptSet         *fRequiredScripts;
   1.189 +    UHashtable        *fScriptSetSet;
   1.190 +    ScriptSet         *fCommonAmongAlternates;
   1.191 +    UnicodeSet        *fNumerics;
   1.192 +    UnicodeSet        *fIdentifierProfile;
   1.193 +
   1.194 +    static UnicodeSet *ASCII;
   1.195 +    static ScriptSet  *JAPANESE;
   1.196 +    static ScriptSet  *CHINESE;
   1.197 +    static ScriptSet  *KOREAN;
   1.198 +    static ScriptSet  *CONFUSABLE_WITH_LATIN;
   1.199 +
   1.200 +
   1.201 +
   1.202 +};
   1.203 +
   1.204 +U_NAMESPACE_END
   1.205 +
   1.206 +#endif // __IDENTIFIER_INFO_H__
   1.207 +

mercurial