michael@0: /* michael@0: ************************************************************************** michael@0: * Copyright (c) 2002-2010, International Business Machines Corporation * michael@0: * and others. All Rights Reserved. * michael@0: ************************************************************************** michael@0: * Date Name Description * michael@0: * 01/28/2002 aliu Creation. * michael@0: ************************************************************************** michael@0: */ michael@0: #ifndef TRIDPARS_H michael@0: #define TRIDPARS_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION michael@0: michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/unistr.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: class Transliterator; michael@0: class UnicodeSet; michael@0: class UVector; michael@0: michael@0: /** michael@0: * Parsing component for transliterator IDs. This class contains only michael@0: * static members; it cannot be instantiated. Methods in this class michael@0: * parse various ID formats, including the following: michael@0: * michael@0: * A basic ID, which contains source, target, and variant, but no michael@0: * filter and no explicit inverse. Examples include michael@0: * "Latin-Greek/UNGEGN" and "Null". michael@0: * michael@0: * A single ID, which is a basic ID plus optional filter and optional michael@0: * explicit inverse. Examples include "[a-zA-Z] Latin-Greek" and michael@0: * "Lower (Upper)". michael@0: * michael@0: * A compound ID, which is a sequence of one or more single IDs, michael@0: * separated by semicolons, with optional forward and reverse global michael@0: * filters. The global filters are UnicodeSet patterns prepended or michael@0: * appended to the IDs, separated by semicolons. An appended filter michael@0: * must be enclosed in parentheses and applies in the reverse michael@0: * direction. michael@0: * michael@0: * @author Alan Liu michael@0: */ michael@0: class TransliteratorIDParser /* not : public UObject because all methods are static */ { michael@0: michael@0: public: michael@0: michael@0: /** michael@0: * A structure containing the parsed data of a filtered ID, that michael@0: * is, a basic ID optionally with a filter. michael@0: * michael@0: * 'source' and 'target' will always be non-null. The 'variant' michael@0: * will be non-null only if a non-empty variant was parsed. michael@0: * michael@0: * 'sawSource' is true if there was an explicit source in the michael@0: * parsed id. If there was no explicit source, then an implied michael@0: * source of ANY is returned and 'sawSource' is set to false. michael@0: * michael@0: * 'filter' is the parsed filter pattern, or null if there was no michael@0: * filter. michael@0: */ michael@0: class Specs : public UMemory { michael@0: public: michael@0: UnicodeString source; // not null michael@0: UnicodeString target; // not null michael@0: UnicodeString variant; // may be null michael@0: UnicodeString filter; // may be null michael@0: UBool sawSource; michael@0: Specs(const UnicodeString& s, const UnicodeString& t, michael@0: const UnicodeString& v, UBool sawS, michael@0: const UnicodeString& f); michael@0: michael@0: private: michael@0: michael@0: Specs(const Specs &other); // forbid copying of this class michael@0: Specs &operator=(const Specs &other); // forbid copying of this class michael@0: }; michael@0: michael@0: /** michael@0: * A structure containing the canonicalized data of a filtered ID, michael@0: * that is, a basic ID optionally with a filter. michael@0: * michael@0: * 'canonID' is always non-null. It may be the empty string "". michael@0: * It is the id that should be assigned to the created michael@0: * transliterator. It _cannot_ be instantiated directly. michael@0: * michael@0: * 'basicID' is always non-null and non-empty. It is always of michael@0: * the form S-T or S-T/V. It is designed to be fed to low-level michael@0: * instantiation code that only understands these two formats. michael@0: * michael@0: * 'filter' may be null, if there is none, or non-null and michael@0: * non-empty. michael@0: */ michael@0: class SingleID : public UMemory { michael@0: public: michael@0: UnicodeString canonID; michael@0: UnicodeString basicID; michael@0: UnicodeString filter; michael@0: SingleID(const UnicodeString& c, const UnicodeString& b, michael@0: const UnicodeString& f); michael@0: SingleID(const UnicodeString& c, const UnicodeString& b); michael@0: Transliterator* createInstance(); michael@0: michael@0: private: michael@0: michael@0: SingleID(const SingleID &other); // forbid copying of this class michael@0: SingleID &operator=(const SingleID &other); // forbid copying of this class michael@0: }; michael@0: michael@0: /** michael@0: * Parse a filter ID, that is, an ID of the general form michael@0: * "[f1] s1-t1/v1", with the filters optional, and the variants optional. michael@0: * @param id the id to be parsed michael@0: * @param pos INPUT-OUTPUT parameter. On input, the position of michael@0: * the first character to parse. On output, the position after michael@0: * the last character parsed. michael@0: * @return a SingleID object or null if the parse fails michael@0: */ michael@0: static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos); michael@0: michael@0: /** michael@0: * Parse a single ID, that is, an ID of the general form michael@0: * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element michael@0: * optional, the filters optional, and the variants optional. michael@0: * @param id the id to be parsed michael@0: * @param pos INPUT-OUTPUT parameter. On input, the position of michael@0: * the first character to parse. On output, the position after michael@0: * the last character parsed. michael@0: * @param dir the direction. If the direction is REVERSE then the michael@0: * SingleID is constructed for the reverse direction. michael@0: * @return a SingleID object or null michael@0: */ michael@0: static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos, michael@0: int32_t dir, UErrorCode& status); michael@0: michael@0: /** michael@0: * Parse a global filter of the form "[f]" or "([f])", depending michael@0: * on 'withParens'. michael@0: * @param id the pattern the parse michael@0: * @param pos INPUT-OUTPUT parameter. On input, the position of michael@0: * the first character to parse. On output, the position after michael@0: * the last character parsed. michael@0: * @param dir the direction. michael@0: * @param withParens INPUT-OUTPUT parameter. On entry, if michael@0: * withParens[0] is 0, then parens are disallowed. If it is 1, michael@0: * then parens are required. If it is -1, then parens are michael@0: * optional, and the return result will be set to 0 or 1. michael@0: * @param canonID OUTPUT parameter. The pattern for the filter michael@0: * added to the canonID, either at the end, if dir is FORWARD, or michael@0: * at the start, if dir is REVERSE. The pattern will be enclosed michael@0: * in parentheses if appropriate, and will be suffixed with an michael@0: * ID_DELIM character. May be null. michael@0: * @return a UnicodeSet object or null. A non-null results michael@0: * indicates a successful parse, regardless of whether the filter michael@0: * applies to the given direction. The caller should discard it michael@0: * if withParens != (dir == REVERSE). michael@0: */ michael@0: static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos, michael@0: int32_t dir, michael@0: int32_t& withParens, michael@0: UnicodeString* canonID); michael@0: michael@0: /** michael@0: * Parse a compound ID, consisting of an optional forward global michael@0: * filter, a separator, one or more single IDs delimited by michael@0: * separators, an an optional reverse global filter. The michael@0: * separator is a semicolon. The global filters are UnicodeSet michael@0: * patterns. The reverse global filter must be enclosed in michael@0: * parentheses. michael@0: * @param id the pattern the parse michael@0: * @param dir the direction. michael@0: * @param canonID OUTPUT parameter that receives the canonical ID, michael@0: * consisting of canonical IDs for all elements, as returned by michael@0: * parseSingleID(), separated by semicolons. Previous contents michael@0: * are discarded. michael@0: * @param list OUTPUT parameter that receives a list of SingleID michael@0: * objects representing the parsed IDs. Previous contents are michael@0: * discarded. michael@0: * @param globalFilter OUTPUT parameter that receives a pointer to michael@0: * a newly created global filter for this ID in this direction, or michael@0: * null if there is none. michael@0: * @return true if the parse succeeds, that is, if the entire michael@0: * id is consumed without syntax error. michael@0: */ michael@0: static UBool parseCompoundID(const UnicodeString& id, int32_t dir, michael@0: UnicodeString& canonID, michael@0: UVector& list, michael@0: UnicodeSet*& globalFilter); michael@0: michael@0: /** michael@0: * Convert the elements of the 'list' vector, which are SingleID michael@0: * objects, into actual Transliterator objects. In the course of michael@0: * this, some (or all) entries may be removed. If all entries michael@0: * are removed, the Null transliterator will be added. michael@0: * michael@0: * Delete entries with empty basicIDs; these are generated by michael@0: * elements like "(A)" in the forward direction, or "A()" in michael@0: * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert michael@0: * SingleID entries to actual transliterators. michael@0: * michael@0: * @param list vector of SingleID objects. On exit, vector michael@0: * of one or more Transliterators. michael@0: * @param ec Output param to receive a success or an error code. michael@0: * @return new value of insertIndex. The index will shift if michael@0: * there are empty items, like "(Lower)", with indices less than michael@0: * insertIndex. michael@0: */ michael@0: static void instantiateList(UVector& list, michael@0: UErrorCode& ec); michael@0: michael@0: /** michael@0: * Parse an ID into pieces. Take IDs of the form T, T/V, S-T, michael@0: * S-T/V, or S/V-T. If the source is missing, return a source of michael@0: * ANY. michael@0: * @param id the id string, in any of several forms michael@0: * @param source the given source. michael@0: * @param target the given target. michael@0: * @param variant the given variant michael@0: * @param isSourcePresent If TRUE then the source is present. michael@0: * If the source is not present, ANY will be michael@0: * given as the source, and isSourcePresent will be null michael@0: * @return an array of 4 strings: source, target, variant, and michael@0: * isSourcePresent. If the source is not present, ANY will be michael@0: * given as the source, and isSourcePresent will be null. Otherwise michael@0: * isSourcePresent will be non-null. The target may be empty if the michael@0: * id is not well-formed. The variant may be empty. michael@0: */ michael@0: static void IDtoSTV(const UnicodeString& id, michael@0: UnicodeString& source, michael@0: UnicodeString& target, michael@0: UnicodeString& variant, michael@0: UBool& isSourcePresent); michael@0: michael@0: /** michael@0: * Given source, target, and variant strings, concatenate them into a michael@0: * full ID. If the source is empty, then "Any" will be used for the michael@0: * source, so the ID will always be of the form s-t/v or s-t. michael@0: */ michael@0: static void STVtoID(const UnicodeString& source, michael@0: const UnicodeString& target, michael@0: const UnicodeString& variant, michael@0: UnicodeString& id); michael@0: michael@0: /** michael@0: * Register two targets as being inverses of one another. For michael@0: * example, calling registerSpecialInverse("NFC", "NFD", true) causes michael@0: * Transliterator to form the following inverse relationships: michael@0: * michael@0: *
NFC => NFD michael@0: * Any-NFC => Any-NFD michael@0: * NFD => NFC michael@0: * Any-NFD => Any-NFCmichael@0: * michael@0: * (Without the special inverse registration, the inverse of NFC michael@0: * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but michael@0: * that the presence or absence of "Any-" is preserved. michael@0: * michael@0: *
The relationship is symmetrical; registering (a, b) is michael@0: * equivalent to registering (b, a). michael@0: * michael@0: *
The relevant IDs must still be registered separately as michael@0: * factories or classes. michael@0: * michael@0: *
Only the targets are specified. Special inverses always michael@0: * have the form Any-Target1 <=> Any-Target2. The target should michael@0: * have canonical casing (the casing desired to be produced when michael@0: * an inverse is formed) and should contain no whitespace or other michael@0: * extraneous characters. michael@0: * michael@0: * @param target the target against which to register the inverse michael@0: * @param inverseTarget the inverse of target, that is michael@0: * Any-target.getInverse() => Any-inverseTarget michael@0: * @param bidirectional if true, register the reverse relation michael@0: * as well, that is, Any-inverseTarget.getInverse() => Any-target michael@0: */ michael@0: static void registerSpecialInverse(const UnicodeString& target, michael@0: const UnicodeString& inverseTarget, michael@0: UBool bidirectional, michael@0: UErrorCode &status); michael@0: michael@0: /** michael@0: * Free static memory. michael@0: */ michael@0: static void cleanup(); michael@0: michael@0: private: michael@0: //---------------------------------------------------------------- michael@0: // Private implementation michael@0: //---------------------------------------------------------------- michael@0: michael@0: // forbid instantiation michael@0: TransliteratorIDParser(); michael@0: michael@0: /** michael@0: * Parse an ID into component pieces. Take IDs of the form T, michael@0: * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a michael@0: * source of ANY. michael@0: * @param id the id string, in any of several forms michael@0: * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the michael@0: * offset of the first character to parse in id. On output, michael@0: * pos[0] is the offset after the last parsed character. If the michael@0: * parse failed, pos[0] will be unchanged. michael@0: * @param allowFilter if true, a UnicodeSet pattern is allowed michael@0: * at any location between specs or delimiters, and is returned michael@0: * as the fifth string in the array. michael@0: * @return a Specs object, or null if the parse failed. If michael@0: * neither source nor target was seen in the parsed id, then the michael@0: * parse fails. If allowFilter is true, then the parsed filter michael@0: * pattern is returned in the Specs object, otherwise the returned michael@0: * filter reference is null. If the parse fails for any reason michael@0: * null is returned. michael@0: */ michael@0: static Specs* parseFilterID(const UnicodeString& id, int32_t& pos, michael@0: UBool allowFilter); michael@0: michael@0: /** michael@0: * Givens a Specs object, convert it to a SingleID object. The michael@0: * Spec object is a more unprocessed parse result. The SingleID michael@0: * object contains information about canonical and basic IDs. michael@0: * @param specs the given Specs object. michael@0: * @param dir either FORWARD or REVERSE. michael@0: * @return a SingleID; never returns null. Returned object always michael@0: * has 'filter' field of null. michael@0: */ michael@0: static SingleID* specsToID(const Specs* specs, int32_t dir); michael@0: michael@0: /** michael@0: * Given a Specs object, return a SingleID representing the michael@0: * special inverse of that ID. If there is no special inverse michael@0: * then return null. michael@0: * @param specs the given Specs. michael@0: * @return a SingleID or null. Returned object always has michael@0: * 'filter' field of null. michael@0: */ michael@0: static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status); michael@0: michael@0: /** michael@0: * Glue method to get around access problems in C++. michael@0: * @param id the id string for the transliterator, in any of several forms michael@0: * @param canonID the given canonical ID michael@0: */ michael@0: static Transliterator* createBasicInstance(const UnicodeString& id, michael@0: const UnicodeString* canonID); michael@0: michael@0: /** michael@0: * Initialize static memory. michael@0: */ michael@0: static void init(UErrorCode &status); michael@0: michael@0: friend class SingleID; michael@0: }; michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */ michael@0: michael@0: #endif