/*
 **************************************************************************
 *   Copyright (c) 2002-2010, International Business Machines Corporation *
 *   and others.  All Rights Reserved.                                    *
 **************************************************************************
 *   Date        Name        Description                                  *
 *   01/28/2002  aliu        Creation.                                    *
 **************************************************************************
 */
#ifndef TRIDPARS_H
#define TRIDPARS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/unistr.h"

U_NAMESPACE_BEGIN

class Transliterator;
class UnicodeSet;
class UVector;

/**
 * Parsing component for transliterator IDs.  This class contains only
 * static members; it cannot be instantiated.  Methods in this class
 * parse various ID formats, including the following:
 *
 * A basic ID, which contains source, target, and variant, but no
 * filter and no explicit inverse.  Examples include
 * "Latin-Greek/UNGEGN" and "Null".
 *
 * A single ID, which is a basic ID plus optional filter and optional
 * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
 * "Lower (Upper)".
 *
 * A compound ID, which is a sequence of one or more single IDs,
 * separated by semicolons, with optional forward and reverse global
 * filters.  The global filters are UnicodeSet patterns prepended or
 * appended to the IDs, separated by semicolons.  An appended filter
 * must be enclosed in parentheses and applies in the reverse
 * direction.
 *
 * @author Alan Liu
 */
class TransliteratorIDParser /* not : public UObject because all methods are static */ {

 public:

    /**
     * A structure containing the parsed data of a filtered ID, that
     * is, a basic ID optionally with a filter.
     *
     * 'source' and 'target' will always be non-null.  The 'variant'
     * will be non-null only if a non-empty variant was parsed.
     *
     * 'sawSource' is true if there was an explicit source in the
     * parsed id.  If there was no explicit source, then an implied
     * source of ANY is returned and 'sawSource' is set to false.
     *
     * 'filter' is the parsed filter pattern, or null if there was no
     * filter.
     *
     * NOTE(review): the members below are UnicodeString objects held
     * by value, so "null" in this comment cannot mean a null pointer.
     * Presumably it corresponds to an empty (or bogus) string --
     * confirm against the implementation.
     *
     * Copying is forbidden: the copy constructor and assignment
     * operator are declared private.
     */
    class Specs : public UMemory {
    public:
        UnicodeString source; // not null
        UnicodeString target; // not null
        UnicodeString variant; // may be null (see NOTE(review) above)
        UnicodeString filter; // may be null (see NOTE(review) above)
        UBool sawSource;

        /**
         * Construct a Specs from its parts.
         * NOTE(review): judging by the parameter names, s, t, v, sawS
         * and f presumably initialize source, target, variant,
         * sawSource and filter respectively -- confirm in the
         * implementation file.
         */
        Specs(const UnicodeString& s, const UnicodeString& t,
              const UnicodeString& v, UBool sawS,
              const UnicodeString& f);

    private:

        Specs(const Specs &other); // forbid copying of this class
        Specs &operator=(const Specs &other); // forbid copying of this class
    };

    /**
     * A structure containing the canonicalized data of a filtered ID,
     * that is, a basic ID optionally with a filter.
     *
     * 'canonID' is always non-null.  It may be the empty string "".
     * It is the id that should be assigned to the created
     * transliterator.  It _cannot_ be instantiated directly.
     *
     * 'basicID' is always non-null and non-empty.  It is always of
     * the form S-T or S-T/V.  It is designed to be fed to low-level
     * instantiation code that only understands these two formats.
     *
     * 'filter' may be null, if there is none, or non-null and
     * non-empty.
     *
     * NOTE(review): the members are UnicodeString objects held by
     * value, so "null" here presumably means an empty (or bogus)
     * string -- confirm against the implementation.
     *
     * Copying is forbidden: the copy constructor and assignment
     * operator are declared private.
     */
    class SingleID : public UMemory {
    public:
        UnicodeString canonID;
        UnicodeString basicID;
        UnicodeString filter;

        /**
         * Construct a SingleID with a filter.
         * NOTE(review): c, b and f presumably initialize canonID,
         * basicID and filter respectively -- confirm in the
         * implementation file.
         */
        SingleID(const UnicodeString& c, const UnicodeString& b,
                 const UnicodeString& f);

        /**
         * Construct a SingleID without an explicit filter argument.
         * NOTE(review): c and b presumably initialize canonID and
         * basicID; the resulting value of 'filter' is not visible in
         * this header.
         */
        SingleID(const UnicodeString& c, const UnicodeString& b);

        /**
         * Create a Transliterator from this SingleID.
         * NOTE(review): ownership of the returned object and the
         * result on failure are not visible in this header --
         * confirm in the implementation file.
         */
        Transliterator* createInstance();

    private:

        SingleID(const SingleID &other); // forbid copying of this class
        SingleID &operator=(const SingleID &other); // forbid copying of this class
    };

    /**
     * Parse a filter ID, that is, an ID of the general form
     * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @return a SingleID object or null if the parse fails
     */
    static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);

    /**
     * Parse a single ID, that is, an ID of the general form
     * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
     * optional, the filters optional, and the variants optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.  If the direction is REVERSE then the
     * SingleID is constructed for the reverse direction.
     * @param status error code parameter.  NOTE(review): the
     * conditions under which it is set are not visible in this
     * header -- confirm in the implementation file.
     * @return a SingleID object or null
     */
    static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
                                  int32_t dir, UErrorCode& status);

    /**
     * Parse a global filter of the form "[f]" or "([f])", depending
     * on 'withParens'.
     * @param id the pattern to parse
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.
     * @param withParens INPUT-OUTPUT parameter.  On entry, if
     * withParens is 0, then parens are disallowed.  If it is 1,
     * then parens are required.  If it is -1, then parens are
     * optional, and withParens will be set to 0 or 1 on return.
     * @param canonID OUTPUT parameter.  The pattern for the filter
     * added to the canonID, either at the end, if dir is FORWARD, or
     * at the start, if dir is REVERSE.  The pattern will be enclosed
     * in parentheses if appropriate, and will be suffixed with an
     * ID_DELIM character.  May be null.
     * @return a UnicodeSet object or null.  A non-null result
     * indicates a successful parse, regardless of whether the filter
     * applies to the given direction.  The caller should discard it
     * if withParens != (dir == REVERSE).
     */
    static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
                                         int32_t dir,
                                         int32_t& withParens,
                                         UnicodeString* canonID);

    /**
     * Parse a compound ID, consisting of an optional forward global
     * filter, a separator, one or more single IDs delimited by
     * separators, and an optional reverse global filter.  The
     * separator is a semicolon.  The global filters are UnicodeSet
     * patterns.  The reverse global filter must be enclosed in
     * parentheses.
     * @param id the pattern to parse
     * @param dir the direction.
     * @param canonID OUTPUT parameter that receives the canonical ID,
     * consisting of canonical IDs for all elements, as returned by
     * parseSingleID(), separated by semicolons.  Previous contents
     * are discarded.
     * @param list OUTPUT parameter that receives a list of SingleID
     * objects representing the parsed IDs.  Previous contents are
     * discarded.
     * @param globalFilter OUTPUT parameter that receives a pointer to
     * a newly created global filter for this ID in this direction, or
     * null if there is none.
     * @return true if the parse succeeds, that is, if the entire
     * id is consumed without syntax error.
     */
    static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
                                 UnicodeString& canonID,
                                 UVector& list,
                                 UnicodeSet*& globalFilter);

    /**
     * Convert the elements of the 'list' vector, which are SingleID
     * objects, into actual Transliterator objects.  In the course of
     * this, some (or all) entries may be removed.  If all entries
     * are removed, the Null transliterator will be added.
     *
     * Delete entries with empty basicIDs; these are generated by
     * elements like "(A)" in the forward direction, or "A()" in
     * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
     * SingleID entries to actual transliterators.
     *
     * This function returns nothing; its results are delivered
     * through 'list' and 'ec'.
     *
     * @param list vector of SingleID objects.  On exit, vector
     * of one or more Transliterators.
     * @param ec Output param to receive a success or an error code.
     */
    static void instantiateList(UVector& list,
                                UErrorCode& ec);

    /**
     * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
     * S-T/V, or S/V-T.  If the source is missing, return a source of
     * ANY.
     *
     * This function returns nothing; all results are delivered
     * through the non-const reference parameters.
     *
     * @param id the id string, in any of several forms
     * @param source OUTPUT parameter that receives the source.  If
     * the source is not present in id, ANY is given as the source.
     * @param target OUTPUT parameter that receives the target.  The
     * target may be empty if the id is not well-formed.
     * @param variant OUTPUT parameter that receives the variant.
     * The variant may be empty.
     * @param isSourcePresent OUTPUT parameter.  If TRUE then the
     * source is present in id.  NOTE(review): presumably set to
     * FALSE when the source is absent (earlier wording said "null",
     * which does not apply to a UBool) -- confirm in the
     * implementation file.
     */
    static void IDtoSTV(const UnicodeString& id,
                        UnicodeString& source,
                        UnicodeString& target,
                        UnicodeString& variant,
                        UBool& isSourcePresent);

    /**
     * Given source, target, and variant strings, concatenate them into a
     * full ID.  If the source is empty, then "Any" will be used for the
     * source, so the ID will always be of the form s-t/v or s-t.
     * @param source the source string; may be empty
     * @param target the target string
     * @param variant the variant string
     * @param id OUTPUT parameter that receives the full ID
     */
    static void STVtoID(const UnicodeString& source,
                        const UnicodeString& target,
                        const UnicodeString& variant,
                        UnicodeString& id);

    /**
     * Register two targets as being inverses of one another.  For
     * example, calling registerSpecialInverse("NFC", "NFD", true, status)
     * causes Transliterator to form the following inverse relationships:
     *
     * <pre>NFC => NFD
     * Any-NFC => Any-NFD
     * NFD => NFC
     * Any-NFD => Any-NFC</pre>
     *
     * (Without the special inverse registration, the inverse of NFC
     * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
     * that the presence or absence of "Any-" is preserved.
     *
     * <p>The relationship is symmetrical; registering (a, b) is
     * equivalent to registering (b, a).
     *
     * <p>The relevant IDs must still be registered separately as
     * factories or classes.
     *
     * <p>Only the targets are specified.  Special inverses always
     * have the form Any-Target1 <=> Any-Target2.  The target should
     * have canonical casing (the casing desired to be produced when
     * an inverse is formed) and should contain no whitespace or other
     * extraneous characters.
     *
     * @param target the target against which to register the inverse
     * @param inverseTarget the inverse of target, that is
     * Any-target.getInverse() => Any-inverseTarget
     * @param bidirectional if true, register the reverse relation
     * as well, that is, Any-inverseTarget.getInverse() => Any-target
     * @param status error code parameter.  NOTE(review): the
     * conditions under which it is set are not visible in this
     * header -- confirm in the implementation file.
     */
    static void registerSpecialInverse(const UnicodeString& target,
                                       const UnicodeString& inverseTarget,
                                       UBool bidirectional,
                                       UErrorCode &status);

    /**
     * Free static memory.
     */
    static void cleanup();

 private:
    //----------------------------------------------------------------
    // Private implementation
    //----------------------------------------------------------------

    // forbid instantiation
    TransliteratorIDParser();

    /**
     * Parse an ID into component pieces.  Take IDs of the form T,
     * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
     * source of ANY.
     * @param id the id string, in any of several forms
     * @param pos INPUT-OUTPUT parameter.  On input, pos is the
     * offset of the first character to parse in id.  On output,
     * pos is the offset after the last parsed character.  If the
     * parse failed, pos will be unchanged.
     * @param allowFilter if true, a UnicodeSet pattern is allowed
     * at any location between specs or delimiters, and is returned
     * in the 'filter' field of the Specs object.
     * @return a Specs object, or null if the parse failed.  If
     * neither source nor target was seen in the parsed id, then the
     * parse fails.  If allowFilter is true, then the parsed filter
     * pattern is returned in the Specs object, otherwise the returned
     * filter is null (NOTE(review): 'filter' is a by-value
     * UnicodeString, so presumably empty or bogus -- confirm).  If
     * the parse fails for any reason null is returned.
     */
    static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
                                UBool allowFilter);

    /**
     * Given a Specs object, convert it to a SingleID object.  The
     * Specs object is a more unprocessed parse result.  The SingleID
     * object contains information about canonical and basic IDs.
     * @param specs the given Specs object.
     * @param dir either FORWARD or REVERSE.
     * @return a SingleID; never returns null.  Returned object always
     * has 'filter' field of null.
     */
    static SingleID* specsToID(const Specs* specs, int32_t dir);

    /**
     * Given a Specs object, return a SingleID representing the
     * special inverse of that ID.  If there is no special inverse
     * then return null.
     * @param specs the given Specs.
     * @param status error code parameter.  NOTE(review): the
     * conditions under which it is set are not visible in this
     * header -- confirm in the implementation file.
     * @return a SingleID or null.  Returned object always has
     * 'filter' field of null.
     */
    static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);

    /**
     * Glue method to get around access problems in C++.
     * @param id the id string for the transliterator, in any of several forms
     * @param canonID the given canonical ID
     * @return a Transliterator pointer.  NOTE(review): ownership and
     * the result on failure are not visible in this header --
     * confirm in the implementation file.
     */
    static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);

    /**
     * Initialize static memory.
     * @param status error code parameter.  NOTE(review): the
     * conditions under which it is set are not visible in this
     * header -- confirm in the implementation file.
     */
    static void init(UErrorCode &status);

    // Grants the nested SingleID class access to the private members
    // of this class.
    friend class SingleID;
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif