intl/icu/source/i18n/rbt.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 1999-2007, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 *   Date        Name        Description
     7 *   11/17/99    aliu        Creation.
     8 **********************************************************************
     9 */
    10 #ifndef RBT_H
    11 #define RBT_H
    13 #include "unicode/utypes.h"
    15 #if !UCONFIG_NO_TRANSLITERATION
    17 #include "unicode/translit.h"
    18 #include "unicode/utypes.h"
    19 #include "unicode/parseerr.h"
    20 #include "unicode/udata.h"
    22 #define U_ICUDATA_TRANSLIT U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "translit"
    24 U_NAMESPACE_BEGIN
    26 class TransliterationRuleData;
    28 /**
    29  * <p><code>RuleBasedTransliterator</code> is a transliterator
    30  * that reads a set of rules in order to determine how to perform
    31  * translations. Rule sets are stored in resource bundles indexed by
    32  * name. Rules within a rule set are separated by semicolons (';').
    33  * To include a literal semicolon, prefix it with a backslash ('\').
    34  * Whitespace, as defined by <code>Character.isWhitespace()</code>,
    35  * is ignored. If the first non-blank character on a line is '#',
    36  * the entire line is ignored as a comment. </p>
    37  * 
    38  * <p>Each set of rules consists of two groups, one forward, and one
    39  * reverse. This is a convention that is not enforced; rules for one
    40  * direction may be omitted, with the result that translations in
    41  * that direction will not modify the source text. In addition,
    42  * bidirectional forward-reverse rules may be specified for
    43  * symmetrical transformations.</p>
    44  * 
    45  * <p><b>Rule syntax</b> </p>
    46  * 
    47  * <p>Rule statements take one of the following forms: </p>
    48  * 
    49  * <dl>
    50  *     <dt><code>$alefmadda=\u0622;</code></dt>
    51  *     <dd><strong>Variable definition.</strong> The name on the
    52  *         left is assigned the text on the right. In this example,
    53  *         after this statement, instances of the left hand name,
    54  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
    55  *         the Unicode character U+0622. Variable names must begin
    56  *         with a letter and consist only of letters, digits, and
    57  *         underscores. Case is significant. Duplicate names cause
    58  *         an exception to be thrown, that is, variables cannot be
    59  *         redefined. The right hand side may contain well-formed
    60  *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
    61  *         The right hand side may contain embedded <code>UnicodeSet</code>
    62  *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
    63  *     <dd>&nbsp;</dd>
    64  *     <dt><code>ai&gt;$alefmadda;</code></dt>
    65  *     <dd><strong>Forward translation rule.</strong> This rule
    66  *         states that the string on the left will be changed to the
    67  *         string on the right when performing forward
    68  *         transliteration.</dd>
    69  *     <dt>&nbsp;</dt>
    70  *     <dt><code>ai<$alefmadda;</code></dt>
    71  *     <dd><strong>Reverse translation rule.</strong> This rule
    72  *         states that the string on the right will be changed to
    73  *         the string on the left when performing reverse
    74  *         transliteration.</dd>
    75  * </dl>
    76  * 
    77  * <dl>
    78  *     <dt><code>ai<>$alefmadda;</code></dt>
    79  *     <dd><strong>Bidirectional translation rule.</strong> This
    80  *         rule states that the string on the right will be changed
    81  *         to the string on the left when performing forward
    82  *         transliteration, and vice versa when performing reverse
    83  *         transliteration.</dd>
    84  * </dl>
    85  * 
    86  * <p>Translation rules consist of a <em>match pattern</em> and an <em>output
    87  * string</em>. The match pattern consists of literal characters,
    88  * optionally preceded by context, and optionally followed by
    89  * context. Context characters, like literal pattern characters,
    90  * must be matched in the text being transliterated. However, unlike
    91  * literal pattern characters, they are not replaced by the output
    92  * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
    93  * indicates the characters &quot;<code>def</code>&quot; must be
    94  * preceded by &quot;<code>abc</code>&quot; for a successful match.
    95  * If there is a successful match, &quot;<code>def</code>&quot; will
    96  * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
    97  * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
    98  * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
    99  * (or &quot;<code>123}456</code>&quot;) in which the literal
   100  * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
   101  * </p>
   102  * 
   103  * <p>The output string of a forward or reverse rule consists of
   104  * characters to replace the literal pattern characters. If the
   105  * output string contains the character '<code>|</code>', this is
   106  * taken to indicate the location of the <em>cursor</em> after
   107  * replacement. The cursor is the point in the text at which the
   108  * next replacement, if any, will be applied. The cursor is usually
   109  * placed within the replacement text; however, it can actually be
   110  * placed into the preceding or following context by using the
   111  * special character '<code>@</code>'. Examples:</p>
   112  * 
   113  * <blockquote>
   114  *     <p><code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor
   115  *     before a<br>
   116  *     {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between
   117  *     y and z</code></p>
   118  * </blockquote>
   119  * 
   120  * <p><b>UnicodeSet</b></p>
   121  * 
   122  * <p><code>UnicodeSet</code> patterns may appear anywhere that
   123  * makes sense. They may appear in variable definitions.
   124  * Contrariwise, <code>UnicodeSet</code> patterns may themselves
   125  * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
   126  * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
   127  * 
   128  * <p><code>UnicodeSet</code> patterns may also be embedded directly
   129  * into rule strings. Thus, the following two rules are equivalent:</p>
   130  * 
   131  * <blockquote>
   132  *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
   133  *     [aeiou]&gt;'*';
   134  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
   135  *     Another way</code></p>
   136  * </blockquote>
   137  * 
   138  * <p>See {@link UnicodeSet} for more documentation and examples.</p>
   139  * 
   140  * <p><b>Segments</b></p>
   141  * 
   142  * <p>Segments of the input string can be matched and copied to the
   143  * output string. This makes certain sets of rules simpler and more
   144  * general, and makes reordering possible. For example:</p>
   145  * 
   146  * <blockquote>
   147  *     <p><code>([a-z]) &gt; $1 $1;
   148  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
   149  *     double lowercase letters<br>
   150  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
   151  * </blockquote>
   152  * 
   153  * <p>The segment of the input string to be copied is delimited by
   154  * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
   155  * nine segments may be defined. Segments may not overlap. In the
   156  * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
   157  * represent the input string segments, in left-to-right order of
   158  * definition.</p>
   159  * 
   160  * <p><b>Anchors</b></p>
   161  * 
   162  * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
   163  * special characters '<code>^</code>' and '<code>$</code>'. For example:</p>
   164  * 
   165  * <blockquote>
   166  *   <p><code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text<br>
   167  *   &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
   168  *   of 'a'<br>
   169  *   &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text<br>
   170  *   &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
   171  *   of 'z'</code></p>
   172  * </blockquote>
   173  * 
   174  * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
   175  * This is done by including a virtual anchor character '<code>$</code>' at the end of the
   176  * set pattern. Although this is usually the match character for the end anchor, the set will
   177  * match either the beginning or the end of the text, depending on its placement. For
   178  * example:</p>
   179  * 
   180  * <blockquote>
   181  *   <p><code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor<br>
   182  *   $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start<br>
   183  *   &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code></p>
   184  * </blockquote>
   185  * 
   186  * <p><b>Example</b> </p>
   187  * 
   188  * <p>The following example rules illustrate many of the features of
   189  * the rule language. </p>
   190  * 
   191  * <table border="0" cellpadding="4">
   192  *     <tr>
   193  *         <td valign="top">Rule 1.</td>
   194  *         <td valign="top" nowrap><code>abc{def}&gt;x|y</code></td>
   195  *     </tr>
   196  *     <tr>
   197  *         <td valign="top">Rule 2.</td>
   198  *         <td valign="top" nowrap><code>xyz&gt;r</code></td>
   199  *     </tr>
   200  *     <tr>
   201  *         <td valign="top">Rule 3.</td>
   202  *         <td valign="top" nowrap><code>yz&gt;q</code></td>
   203  *     </tr>
   204  * </table>
   205  * 
   206  * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
   207  * yields the following results: </p>
   208  * 
   209  * <table border="0" cellpadding="4">
   210  *     <tr>
   211  *         <td valign="top" nowrap><code>|adefabcdefz</code></td>
   212  *         <td valign="top">Initial state, no rules match. Advance
   213  *         cursor.</td>
   214  *     </tr>
   215  *     <tr>
   216  *         <td valign="top" nowrap><code>a|defabcdefz</code></td>
   217  *         <td valign="top">Still no match. Rule 1 does not match
   218  *         because the preceding context is not present.</td>
   219  *     </tr>
   220  *     <tr>
   221  *         <td valign="top" nowrap><code>ad|efabcdefz</code></td>
   222  *         <td valign="top">Still no match. Keep advancing until
   223  *         there is a match...</td>
   224  *     </tr>
   225  *     <tr>
   226  *         <td valign="top" nowrap><code>ade|fabcdefz</code></td>
   227  *         <td valign="top">...</td>
   228  *     </tr>
   229  *     <tr>
   230  *         <td valign="top" nowrap><code>adef|abcdefz</code></td>
   231  *         <td valign="top">...</td>
   232  *     </tr>
   233  *     <tr>
   234  *         <td valign="top" nowrap><code>adefa|bcdefz</code></td>
   235  *         <td valign="top">...</td>
   236  *     </tr>
   237  *     <tr>
   238  *         <td valign="top" nowrap><code>adefab|cdefz</code></td>
   239  *         <td valign="top">...</td>
   240  *     </tr>
   241  *     <tr>
   242  *         <td valign="top" nowrap><code>adefabc|defz</code></td>
   243  *         <td valign="top">Rule 1 matches; replace &quot;<code>def</code>&quot;
   244  *         with &quot;<code>xy</code>&quot; and back up the cursor
   245  *         to before the '<code>y</code>'.</td>
   246  *     </tr>
   247  *     <tr>
   248  *         <td valign="top" nowrap><code>adefabcx|yz</code></td>
   249  *         <td valign="top">Although &quot;<code>xyz</code>&quot; is
   250  *         present, rule 2 does not match because the cursor is
   251  *         before the '<code>y</code>', not before the '<code>x</code>'.
   252  *         Rule 3 does match. Replace &quot;<code>yz</code>&quot;
   253  *         with &quot;<code>q</code>&quot;.</td>
   254  *     </tr>
   255  *     <tr>
   256  *         <td valign="top" nowrap><code>adefabcxq|</code></td>
   257  *         <td valign="top">The cursor is at the end;
   258  *         transliteration is complete.</td>
   259  *     </tr>
   260  * </table>
   261  * 
   262  * <p>The order of rules is significant. If multiple rules may match
   263  * at some point, the first matching rule is applied. </p>
   264  * 
   265  * <p>Forward and reverse rules may have an empty output string.
   266  * Otherwise, an empty left or right hand side of any statement is a
   267  * syntax error. </p>
   268  * 
   269  * <p>Single quotes are used to quote any character other than a
   270  * digit or letter. To specify a single quote itself, inside or
   271  * outside of quotes, use two single quotes in a row. For example,
   272  * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
   273  * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
   274  * </p>
   275  * 
   276  * <p><b>Notes</b> </p>
   277  * 
   278  * <p>While a RuleBasedTransliterator is being built, it checks that
   279  * the rules are added in proper order. For example, if the rule
   280  * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
   281  * then the second rule will throw an exception. The reason is that
   282  * the second rule can never be triggered, since the first rule
   283  * always matches anything it matches. In other words, the first
   284  * rule <em>masks</em> the second rule. </p>
   285  * 
   286  * @author Alan Liu
   287  * @internal Use transliterator factory methods instead since this class will be removed in that release.
   288  */
   289 class RuleBasedTransliterator : public Transliterator {
   290 private:
   291     /**
   292      * The data object is immutable, so we can freely share it with
   293      * other instances of RBT, as long as we do NOT own this object.
   294      *  TODO:  data is no longer immutable.  See bugs #1866, 2155
   295      */
   296     TransliterationRuleData* fData;
   298     /**
   299      * If true, we own the data object and must delete it.
   300      */
   301     UBool isDataOwned;
   303 public:
   305     /**
   306      * Constructs a new transliterator from the given rules.
            * @param id the id for the transliterator.
   307      * @param rules rules, separated by ';'
   308      * @param direction either FORWARD or REVERSE.
            * @param adoptedFilter the filter for the transliterator
            * NOTE(review): parseError and status look like output arguments for
            * reporting failures -- confirm against the implementation.
   309      * @exception IllegalArgumentException if rules are malformed.
   310      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   311      */
   312     RuleBasedTransliterator(const UnicodeString& id,
   313                             const UnicodeString& rules,
   314                             UTransDirection direction,
   315                             UnicodeFilter* adoptedFilter,
   316                             UParseError& parseError,
   317                             UErrorCode& status);
   319     /**
   320      * Constructs a new transliterator from the given rules.
   321      * @param rules rules, separated by ';'
   322      * @param direction either FORWARD or REVERSE.
   323      * @exception IllegalArgumentException if rules are malformed.
   324      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   325      */
   326     /*RuleBasedTransliterator(const UnicodeString& id,
   327                             const UnicodeString& rules,
   328                             UTransDirection direction,
   329                             UnicodeFilter* adoptedFilter,
   330                             UErrorCode& status);*/
   332     /**
   333      * Convenience constructor with no filter.
   334      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   335      */
   336     /*RuleBasedTransliterator(const UnicodeString& id,
   337                             const UnicodeString& rules,
   338                             UTransDirection direction,
   339                             UErrorCode& status);*/
   341     /**
   342      * Convenience constructor with no filter and FORWARD direction.
   343      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   344      */
   345     /*RuleBasedTransliterator(const UnicodeString& id,
   346                             const UnicodeString& rules,
   347                             UErrorCode& status);*/
   349     /**
   350      * Convenience constructor with FORWARD direction.
   351      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   352      */
   353     /*RuleBasedTransliterator(const UnicodeString& id,
   354                             const UnicodeString& rules,
   355                             UnicodeFilter* adoptedFilter,
   356                             UErrorCode& status);*/
   357 private:
   359      friend class TransliteratorRegistry; // to access TransliterationRuleData convenience ctor
   360     /**
   361      * Convenience constructor.
   362      * @param id            the id for the transliterator.
   363      * @param theData       the rule data for the transliterator.
   364      * @param adoptedFilter the filter for the transliterator
   365      */
   366     RuleBasedTransliterator(const UnicodeString& id,
   367                             const TransliterationRuleData* theData,
   368                             UnicodeFilter* adoptedFilter = 0);
   371     friend class Transliterator; // to access following ctor
   373     /**
   374      * Internal constructor.
   375      * @param id            the id for the transliterator.
   376      * @param data          the rule data for the transliterator.
   377      * @param isDataAdopted determines who will own the 'data' object. If true, the caller should not delete 'data'.
   378      */
   379     RuleBasedTransliterator(const UnicodeString& id,
   380                             TransliterationRuleData* data,
   381                             UBool isDataAdopted);
   383 public:
   385     /**
   386      * Copy constructor.
   387      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   388      */
   389     RuleBasedTransliterator(const RuleBasedTransliterator&);
   391     virtual ~RuleBasedTransliterator();
   393     /**
   394      * Implement Transliterator API.
   395      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   396      */
   397     virtual Transliterator* clone(void) const;
   399 protected:
   400     /**
   401      * Implements {@link Transliterator#handleTransliterate}.
   402      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   403      */
   404     virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets,
   405                                      UBool isIncremental) const;
   407 public:
   408     /**
   409      * Return a representation of this transliterator as source rules.
   410      * These rules will produce an equivalent transliterator if used
   411      * to construct a new transliterator.
   412      * @param result the string to receive the rules.  Previous
   413      * contents will be deleted.
   414      * @param escapeUnprintable if TRUE then convert unprintable
   415      * character to their hex escape representations, \uxxxx or
   416      * \Uxxxxxxxx.  Unprintable characters are those other than
   417      * U+000A, U+0020..U+007E.
   418      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   419      */
   420     virtual UnicodeString& toRules(UnicodeString& result,
   421                                    UBool escapeUnprintable) const;
   423 protected:
   424     /**
   425      * Implement Transliterator framework
   426      */
   427     virtual void handleGetSourceSet(UnicodeSet& result) const;
   429 public:
   430     /**
   431      * Override Transliterator framework
   432      */
   433     virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
   435     /**
   436      * Return the class ID for this class.  This is useful only for
   437      * comparing to a return value from getDynamicClassID().  For example:
   438      * <pre>
   439      * .      Base* polymorphic_pointer = createPolymorphicObject();
   440      * .      if (polymorphic_pointer->getDynamicClassID() ==
   441      * .          Derived::getStaticClassID()) ...
   442      * </pre>
   443      * @return          The class ID for all objects of this class.
   444      * @internal Use transliterator factory methods instead since this class will be removed in that release.
   445      */
   446     U_I18N_API static UClassID U_EXPORT2 getStaticClassID(void);
   448     /**
   449      * Returns a unique class ID <b>polymorphically</b>.  This method
   450      * is to implement a simple version of RTTI, since not all C++
   451      * compilers support genuine RTTI.  Polymorphic operator==() and
   452      * clone() methods call this method.
   453      * 
   454      * @return The class ID for this object. All objects of a given
   455      * class have the same class ID.  Objects of other classes have
   456      * different class IDs.
   457      */
   458     virtual UClassID getDynamicClassID(void) const;
   460 private:
           /**
            * Private helper declared alongside the constructors; it takes the
            * same rules, direction, parseError and status arguments as the
            * public rules-based constructor.
            * NOTE(review): presumably called by that constructor to parse the
            * rules -- confirm against the implementation file.
            */
   462     void _construct(const UnicodeString& rules,
   463                     UTransDirection direction,
   464                     UParseError& parseError,
   465                     UErrorCode& status);
   466 };
   469 U_NAMESPACE_END
   471 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   473 #endif

mercurial