intl/icu/source/i18n/tridpars.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/tridpars.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,361 @@
     1.4 +/*
     1.5 + **************************************************************************
     1.6 + *   Copyright (c) 2002-2010, International Business Machines Corporation *
     1.7 + *   and others.  All Rights Reserved.                                    *
     1.8 + **************************************************************************
     1.9 + *   Date        Name        Description                                  *
    1.10 + *   01/28/2002  aliu        Creation.                                    *
    1.11 + **************************************************************************
    1.12 + */
    1.13 +#ifndef TRIDPARS_H
    1.14 +#define TRIDPARS_H
    1.15 +
    1.16 +#include "unicode/utypes.h"
    1.17 +
    1.18 +#if !UCONFIG_NO_TRANSLITERATION
    1.19 +
    1.20 +#include "unicode/uobject.h"
    1.21 +#include "unicode/unistr.h"
    1.22 +
    1.23 +U_NAMESPACE_BEGIN
    1.24 +
    1.25 +class Transliterator;
    1.26 +class UnicodeSet;
    1.27 +class UVector;
    1.28 +
    1.29 +/**
    1.30 + * Parsing component for transliterator IDs.  This class contains only
    1.31 + * static members; it cannot be instantiated.  Methods in this class
    1.32 + * parse various ID formats, including the following:
    1.33 + *
    1.34 + * A basic ID, which contains source, target, and variant, but no
    1.35 + * filter and no explicit inverse.  Examples include
    1.36 + * "Latin-Greek/UNGEGN" and "Null".
    1.37 + *
    1.38 + * A single ID, which is a basic ID plus optional filter and optional
    1.39 + * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
    1.40 + * "Lower (Upper)".
    1.41 + *
    1.42 + * A compound ID, which is a sequence of one or more single IDs,
    1.43 + * separated by semicolons, with optional forward and reverse global
    1.44 + * filters.  The global filters are UnicodeSet patterns prepended or
    1.45 + * appended to the IDs, separated by semicolons.  An appended filter
    1.46 + * must be enclosed in parentheses and applies in the reverse
    1.47 + * direction.
    1.48 + *
    1.49 + * @author Alan Liu
    1.50 + */
    1.51 +class TransliteratorIDParser /* not : public UObject because all methods are static */ {
    1.52 +
    1.53 + public:
    1.54 +
    1.55 +    /**
    1.56 +     * A structure containing the parsed data of a filtered ID, that
    1.57 +     * is, a basic ID optionally with a filter.
    1.58 +     *
    1.59 +     * 'source' and 'target' will always be non-null.  The 'variant'
    1.60 +     * will be non-null only if a non-empty variant was parsed.
    1.61 +     *
    1.62 +     * 'sawSource' is true if there was an explicit source in the
    1.63 +     * parsed id.  If there was no explicit source, then an implied
    1.64 +     * source of ANY is returned and 'sawSource' is set to false.
    1.65 +     *
    1.66 +     * 'filter' is the parsed filter pattern, or null if there was no
    1.67 +     * filter.
    1.68 +     */
    1.69 +    class Specs : public UMemory {
    1.70 +    public:
    1.71 +        UnicodeString source; // not null
    1.72 +        UnicodeString target; // not null
    1.73 +        UnicodeString variant; // may be "null" -- NOTE(review): held by value, so presumably empty or bogus; confirm
    1.74 +        UnicodeString filter; // may be "null" -- NOTE(review): held by value, so presumably empty or bogus; confirm
    1.75 +        UBool sawSource;
    1.76 +        Specs(const UnicodeString& s, const UnicodeString& t,
    1.77 +              const UnicodeString& v, UBool sawS,
    1.78 +              const UnicodeString& f);
    1.79 +
    1.80 +    private:
    1.81 +
    1.82 +        Specs(const Specs &other); // forbid copying of this class
    1.83 +        Specs &operator=(const Specs &other); // forbid copying of this class
    1.84 +    };
    1.85 +
    1.86 +    /**
    1.87 +     * A structure containing the canonicalized data of a filtered ID,
    1.88 +     * that is, a basic ID optionally with a filter.
    1.89 +     *
    1.90 +     * 'canonID' is always non-null.  It may be the empty string "".
    1.91 +     * It is the id that should be assigned to the created
    1.92 +     * transliterator.  It _cannot_ be instantiated directly.
    1.93 +     *
    1.94 +     * 'basicID' is always non-null and non-empty.  It is always of
    1.95 +     * the form S-T or S-T/V.  It is designed to be fed to low-level
    1.96 +     * instantiation code that only understands these two formats.
    1.97 +     *
    1.98 +     * 'filter' may be null, if there is none, or non-null and
    1.99 +     * non-empty.
   1.100 +     */
   1.101 +    class SingleID : public UMemory {
   1.102 +    public:
   1.103 +        UnicodeString canonID;
   1.104 +        UnicodeString basicID;
   1.105 +        UnicodeString filter;
   1.106 +        SingleID(const UnicodeString& c, const UnicodeString& b,
   1.107 +                 const UnicodeString& f);
   1.108 +        SingleID(const UnicodeString& c, const UnicodeString& b);
   1.109 +        Transliterator* createInstance();
   1.110 +
   1.111 +    private:
   1.112 +
   1.113 +        SingleID(const SingleID &other); // forbid copying of this class
   1.114 +        SingleID &operator=(const SingleID &other); // forbid copying of this class
   1.115 +    };
   1.116 +
   1.117 +    /**
   1.118 +     * Parse a filter ID, that is, an ID of the general form
   1.119 +     * "[f1] s1-t1/v1", with the filter optional and the variant optional.
   1.120 +     * @param id the id to be parsed
   1.121 +     * @param pos INPUT-OUTPUT parameter.  On input, the position of
   1.122 +     * the first character to parse.  On output, the position after
   1.123 +     * the last character parsed.
   1.124 +     * @return a SingleID object or null if the parse fails
   1.125 +     */
   1.126 +    static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);
   1.127 +
   1.128 +    /**
   1.129 +     * Parse a single ID, that is, an ID of the general form
   1.130 +     * "[f1] s1-t1/v1 ([f2] s2-t2/v2)", with the parenthesized element
   1.131 +     * optional, the filters optional, and the variants optional.
   1.132 +     * @param id the id to be parsed
   1.133 +     * @param pos INPUT-OUTPUT parameter.  On input, the position of
   1.134 +     * the first character to parse.  On output, the position after
   1.135 +     * the last character parsed.
   1.136 +     * @param dir the direction.  If REVERSE, the SingleID is constructed for the reverse direction.
   1.137 +     * @param status error code reference -- NOTE(review): the failures it reports are defined in the implementation; confirm there
   1.138 +     * @return a SingleID object or null
   1.139 +     */
   1.140 +    static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
   1.141 +                                  int32_t dir, UErrorCode& status);
   1.142 +
   1.143 +    /**
   1.144 +     * Parse a global filter of the form "[f]" or "([f])", depending
   1.145 +     * on 'withParens'.
   1.146 +     * @param id the pattern to parse
   1.147 +     * @param pos INPUT-OUTPUT parameter.  On input, the position of
   1.148 +     * the first character to parse.  On output, the position after
   1.149 +     * the last character parsed.
   1.150 +     * @param dir the direction.
   1.151 +     * @param withParens INPUT-OUTPUT parameter.  On entry, if
   1.152 +     * withParens is 0, then parens are disallowed.  If it is 1,
   1.153 +     * then parens are required.  If it is -1, then parens are
   1.154 +     * optional, and withParens will be set to 0 or 1 on return.
   1.155 +     * @param canonID OUTPUT parameter.  The pattern for the filter
   1.156 +     * added to the canonID, either at the end, if dir is FORWARD, or
   1.157 +     * at the start, if dir is REVERSE.  The pattern will be enclosed
   1.158 +     * in parentheses if appropriate, and will be suffixed with an
   1.159 +     * ID_DELIM character.  May be null.
   1.160 +     * @return a UnicodeSet object or null.  A non-null result
   1.161 +     * indicates a successful parse, regardless of whether the filter
   1.162 +     * applies to the given direction.  The caller should discard it
   1.163 +     * if withParens != (dir == REVERSE).
   1.164 +     */
   1.165 +    static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
   1.166 +                                         int32_t dir,
   1.167 +                                         int32_t& withParens,
   1.168 +                                         UnicodeString* canonID);
   1.169 +
   1.170 +    /**
   1.171 +     * Parse a compound ID, consisting of an optional forward global
   1.172 +     * filter, a separator, one or more single IDs delimited by
   1.173 +     * separators, and an optional reverse global filter.  The
   1.174 +     * separator is a semicolon.  The global filters are UnicodeSet
   1.175 +     * patterns.  The reverse global filter must be enclosed in
   1.176 +     * parentheses.
   1.177 +     * @param id the pattern to parse
   1.178 +     * @param dir the direction.
   1.179 +     * @param canonID OUTPUT parameter that receives the canonical ID,
   1.180 +     * consisting of canonical IDs for all elements, as returned by
   1.181 +     * parseSingleID(), separated by semicolons.  Previous contents
   1.182 +     * are discarded.
   1.183 +     * @param list OUTPUT parameter that receives a list of SingleID
   1.184 +     * objects representing the parsed IDs.  Previous contents are
   1.185 +     * discarded.
   1.186 +     * @param globalFilter OUTPUT parameter that receives a pointer to
   1.187 +     * a newly created global filter for this ID in this direction, or
   1.188 +     * null if there is none.
   1.189 +     * @return true if the parse succeeds, that is, if the entire
   1.190 +     * id is consumed without syntax error.
   1.191 +     */
   1.192 +    static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
   1.193 +                                 UnicodeString& canonID,
   1.194 +                                 UVector& list,
   1.195 +                                 UnicodeSet*& globalFilter);
   1.196 +
   1.197 +    /**
   1.198 +     * Convert the elements of the 'list' vector, which are SingleID
   1.199 +     * objects, into actual Transliterator objects.  In the course of
   1.200 +     * this, some (or all) entries may be removed.  If all entries
   1.201 +     * are removed, the Null transliterator will be added.
   1.202 +     *
   1.203 +     * Delete entries with empty basicIDs; these are generated by
   1.204 +     * elements like "(A)" in the forward direction, or "A()" in
   1.205 +     * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
   1.206 +     * SingleID entries to actual transliterators.
   1.207 +     *
   1.208 +     * @param list vector of SingleID objects.  On exit, vector
   1.209 +     * of one or more Transliterators.
   1.210 +     * @param ec Output param to receive a success or an error code.
   1.211 +     * No value is returned (the function is void); results are
   1.212 +     * reported through 'list' and 'ec'.  This signature has no
   1.213 +     * insertIndex parameter or return value.
   1.214 +     */
   1.215 +    static void instantiateList(UVector& list,
   1.216 +                                UErrorCode& ec);
   1.217 +
   1.218 +    /**
   1.219 +     * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
   1.220 +     * S-T/V, or S/V-T.  If the source is missing, return a source of
   1.221 +     * ANY.
   1.222 +     * @param id the id string, in any of several forms
   1.223 +     * @param source          OUTPUT parameter receiving the source.
   1.224 +     * @param target          OUTPUT parameter receiving the target.
   1.225 +     * @param variant         OUTPUT parameter receiving the variant.
   1.226 +     * @param isSourcePresent OUTPUT parameter.  If TRUE then the source is present.
   1.227 +     *                        If the source is not present, ANY will be
   1.228 +     *                        given as the source, and isSourcePresent will be FALSE
   1.229 +     * This function returns void; the parsed pieces are delivered
   1.230 +     * through the source, target, variant, and isSourcePresent
   1.231 +     * parameters.  If the source is not present, ANY will be given as
   1.232 +     * the source.  The target may be empty if the id is not
   1.233 +     * well-formed.  The variant may be empty.
   1.234 +     */
   1.235 +    static void IDtoSTV(const UnicodeString& id,
   1.236 +                        UnicodeString& source,
   1.237 +                        UnicodeString& target,
   1.238 +                        UnicodeString& variant,
   1.239 +                        UBool& isSourcePresent);
   1.240 +
   1.241 +    /**
   1.242 +     * Given source, target, and variant strings, concatenate them into a
   1.243 +     * full ID.  If the source is empty, then "Any" will be used for the
   1.244 +     * source, so the ID will always be of the form s-t/v or s-t.
   1.245 +     */
   1.246 +    static void STVtoID(const UnicodeString& source,
   1.247 +                        const UnicodeString& target,
   1.248 +                        const UnicodeString& variant,
   1.249 +                        UnicodeString& id);
   1.250 +
   1.251 +    /**
   1.252 +     * Register two targets as being inverses of one another.  For
   1.253 +     * example, calling registerSpecialInverse("NFC", "NFD", true, status) causes
   1.254 +     * Transliterator to form the following inverse relationships:
   1.255 +     *
   1.256 +     * <pre>NFC => NFD
   1.257 +     * Any-NFC => Any-NFD
   1.258 +     * NFD => NFC
   1.259 +     * Any-NFD => Any-NFC</pre>
   1.260 +     *
   1.261 +     * (Without the special inverse registration, the inverse of NFC
   1.262 +     * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
   1.263 +     * that the presence or absence of "Any-" is preserved.
   1.264 +     *
   1.265 +     * <p>The relationship is symmetrical; registering (a, b) is
   1.266 +     * equivalent to registering (b, a).
   1.267 +     *
   1.268 +     * <p>The relevant IDs must still be registered separately as
   1.269 +     * factories or classes.
   1.270 +     *
   1.271 +     * <p>Only the targets are specified.  Special inverses always
   1.272 +     * have the form Any-Target1 <=> Any-Target2.  The target should
   1.273 +     * have canonical casing (the casing desired to be produced when
   1.274 +     * an inverse is formed) and should contain no whitespace or other
   1.275 +     * extraneous characters.
   1.276 +     *
   1.277 +     * @param target the target against which to register the inverse
   1.278 +     * @param inverseTarget the inverse of target, that is,
   1.279 +     * Any-target.getInverse() => Any-inverseTarget
   1.280 +     * @param bidirectional if true, also register Any-inverseTarget.getInverse() => Any-target
   1.281 +     * @param status error code reference -- NOTE(review): failure conditions are defined in the implementation; confirm there
   1.282 +     */
   1.283 +    static void registerSpecialInverse(const UnicodeString& target,
   1.284 +                                       const UnicodeString& inverseTarget,
   1.285 +                                       UBool bidirectional,
   1.286 +                                       UErrorCode &status);
   1.287 +
   1.288 +    /**
   1.289 +     * Free static memory.
   1.290 +     */
   1.291 +    static void cleanup();
   1.292 +
   1.293 + private:
   1.294 +    //----------------------------------------------------------------
   1.295 +    // Private implementation
   1.296 +    //----------------------------------------------------------------
   1.297 +
   1.298 +    // forbid instantiation
   1.299 +    TransliteratorIDParser();
   1.300 +
   1.301 +    /**
   1.302 +     * Parse an ID into component pieces.  Take IDs of the form T,
   1.303 +     * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
   1.304 +     * source of ANY.
   1.305 +     * @param id the id string, in any of several forms
   1.306 +     * @param pos INPUT-OUTPUT parameter.  On input, pos is the
   1.307 +     * offset of the first character to parse in id.  On output,
   1.308 +     * pos is the offset after the last parsed character.  If the
   1.309 +     * parse failed, pos will be unchanged.
   1.310 +     * @param allowFilter if true, a UnicodeSet pattern is allowed
   1.311 +     * at any location between specs or delimiters, and is returned
   1.312 +     * in the 'filter' field of the Specs object.
   1.313 +     * @return a Specs object, or null if the parse failed.  If
   1.314 +     * neither source nor target was seen in the parsed id, then the
   1.315 +     * parse fails.  If allowFilter is true, then the parsed filter
   1.316 +     * pattern is returned in the Specs object, otherwise the returned
   1.317 +     * filter reference is null.  If the parse fails for any reason
   1.318 +     * null is returned.
   1.319 +     */
   1.320 +    static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
   1.321 +                                UBool allowFilter);
   1.322 +
   1.323 +    /**
   1.324 +     * Given a Specs object, convert it to a SingleID object.  The
   1.325 +     * Specs object is a more unprocessed parse result.  The SingleID
   1.326 +     * object contains information about canonical and basic IDs.
   1.327 +     * @param specs the given Specs object.
   1.328 +     * @param dir   either FORWARD or REVERSE.
   1.329 +     * @return a SingleID; never returns null.  Returned object always
   1.330 +     * has 'filter' field of null.
   1.331 +     */
   1.332 +    static SingleID* specsToID(const Specs* specs, int32_t dir);
   1.333 +
   1.334 +    /**
   1.335 +     * Given a Specs object, return a SingleID representing the
   1.336 +     * special inverse of that ID.  If there is no special inverse
   1.337 +     * then return null.
   1.338 +     * @param specs the given Specs.
   1.339 +     * @param status error code reference -- NOTE(review): failure conditions are defined in the implementation; confirm there
   1.340 +     * @return a SingleID or null.  Returned object always has 'filter' field of null.
   1.341 +     */
   1.342 +    static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);
   1.343 +
   1.344 +    /**
   1.345 +     * Glue method to get around access problems in C++.
   1.346 +     * @param id the id string for the transliterator, in any of several forms
   1.347 +     * @param canonID the given canonical ID
   1.348 +     */
   1.349 +    static Transliterator* createBasicInstance(const UnicodeString& id,
   1.350 +                                               const UnicodeString* canonID);
   1.351 +
   1.352 +    /**
   1.353 +     * Initialize static memory.
   1.354 +     */
   1.355 +    static void init(UErrorCode &status);
   1.356 +
   1.357 +    friend class SingleID;
   1.358 +};
   1.359 +
   1.360 +U_NAMESPACE_END
   1.361 +
   1.362 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1.363 +
   1.364 +#endif

mercurial