The Tor Browser: intl/icu/source/i18n/rbt.h@fc2d59ddac77

     1 /*

     2 **********************************************************************

     3 *   Copyright (C) 1999-2007, International Business Machines

     4 *   Corporation and others.  All Rights Reserved.

     5 **********************************************************************

     6 *   Date        Name        Description

     7 *   11/17/99    aliu        Creation.

     8 **********************************************************************

     9 */

    10 #ifndef RBT_H

    11 #define RBT_H

    13 #include "unicode/utypes.h"

    15 #if !UCONFIG_NO_TRANSLITERATION

    17 #include "unicode/translit.h"

    18 #include "unicode/utypes.h"

    19 #include "unicode/parseerr.h"

    20 #include "unicode/udata.h"

    22 #define U_ICUDATA_TRANSLIT U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "translit"

    24 U_NAMESPACE_BEGIN

    26 class TransliterationRuleData;

    28 /**

    29  * <p><code>RuleBasedTransliterator</code> is a transliterator

    30  * that reads a set of rules in order to determine how to perform

    31  * translations. Rule sets are stored in resource bundles indexed by

    32  * name. Rules within a rule set are separated by semicolons (';').

    33  * To include a literal semicolon, prefix it with a backslash ('\').

    34  * Whitespace, as defined by <code>Character.isWhitespace()</code>,

    35  * is ignored. If the first non-blank character on a line is '#',

    36  * the entire line is ignored as a comment. </p>

    37  *

    38  * <p>Each set of rules consists of two groups, one forward, and one

    39  * reverse. This is a convention that is not enforced; rules for one

    40  * direction may be omitted, with the result that translations in

    41  * that direction will not modify the source text. In addition,

    42  * bidirectional forward-reverse rules may be specified for

    43  * symmetrical transformations.</p>

    44  *

    45  * <p><b>Rule syntax</b> </p>

    46  *

    47  * <p>Rule statements take one of the following forms: </p>

    48  *

    49  * <dl>

    50  *     <dt><code>$alefmadda=\u0622;</code></dt>

    51  *     <dd><strong>Variable definition.</strong> The name on the

    52  *         left is assigned the text on the right. In this example,

    53  *         after this statement, instances of the left hand name,

    54  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by

    55  *         the Unicode character U+0622. Variable names must begin

    56  *         with a letter and consist only of letters, digits, and

    57  *         underscores. Case is significant. Duplicate names cause

    58  *         an exception to be thrown, that is, variables cannot be

    59  *         redefined. The right hand side may contain well-formed

    60  *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).

    61  *         The right hand side may contain embedded <code>UnicodeSet</code>

    62  *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>

    63  *     <dd>&nbsp;</dd>

    64  *     <dt><code>ai&gt;$alefmadda;</code></dt>

    65  *     <dd><strong>Forward translation rule.</strong> This rule

    66  *         states that the string on the left will be changed to the

    67  *         string on the right when performing forward

    68  *         transliteration.</dd>

    69  *     <dt>&nbsp;</dt>

    70  *     <dt><code>ai&lt;$alefmadda;</code></dt>

    71  *     <dd><strong>Reverse translation rule.</strong> This rule

    72  *         states that the string on the right will be changed to

    73  *         the string on the left when performing reverse

    74  *         transliteration.</dd>

    75  * </dl>

    76  *

    77  * <dl>

    78  *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>

    79  *     <dd><strong>Bidirectional translation rule.</strong> This

    80  *         rule states that the string on the left will be changed

    81  *         to the string on the right when performing forward

    82  *         transliteration, and vice versa when performing reverse

    83  *         transliteration.</dd>

    84  * </dl>

    85  *

    86  * <p>Translation rules consist of a <em>match pattern</em> and an <em>output

    87  * string</em>. The match pattern consists of literal characters,

    88  * optionally preceded by context, and optionally followed by

    89  * context. Context characters, like literal pattern characters,

    90  * must be matched in the text being transliterated. However, unlike

    91  * literal pattern characters, they are not replaced by the output

    92  * text. For example, the pattern &quot;<code>abc{def}</code>&quot;

    93  * indicates the characters &quot;<code>def</code>&quot; must be

    94  * preceded by &quot;<code>abc</code>&quot; for a successful match.

    95  * If there is a successful match, &quot;<code>def</code>&quot; will

    96  * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'

    97  * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to

    98  * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;

    99  * (or &quot;<code>123}456</code>&quot;) in which the literal

   100  * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.

   101  * </p>

   102  *

   103  * <p>The output string of a forward or reverse rule consists of

   104  * characters to replace the literal pattern characters. If the

   105  * output string contains the character '<code>|</code>', this is

   106  * taken to indicate the location of the <em>cursor</em> after

   107  * replacement. The cursor is the point in the text at which the

   108  * next replacement, if any, will be applied. The cursor is usually

   109  * placed within the replacement text; however, it can actually be

   110  * placed into the preceding or following context by using the

   111  * special character '<code>@</code>'. Examples:</p>

   112  *

   113  * <blockquote>

   114  *     <p><code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor

   115  *     before a<br>

   116  *     {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between

   117  *     y and z</code></p>

   118  * </blockquote>

   119  *

   120  * <p><b>UnicodeSet</b></p>

   121  *

   122  * <p><code>UnicodeSet</code> patterns may appear anywhere that

   123  * makes sense. They may appear in variable definitions.

   124  * Contrariwise, <code>UnicodeSet</code> patterns may themselves

   125  * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,

   126  * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>

   127  *

   128  * <p><code>UnicodeSet</code> patterns may also be embedded directly

   129  * into rule strings. Thus, the following two rules are equivalent:</p>

   130  *

   131  * <blockquote>

   132  *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>

   133  *     [aeiou]&gt;'*';

   134  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#

   135  *     Another way</code></p>

   136  * </blockquote>

   137  *

   138  * <p>See {@link UnicodeSet} for more documentation and examples.</p>

   139  *

   140  * <p><b>Segments</b></p>

   141  *

   142  * <p>Segments of the input string can be matched and copied to the

   143  * output string. This makes certain sets of rules simpler and more

   144  * general, and makes reordering possible. For example:</p>

   145  *

   146  * <blockquote>

   147  *     <p><code>([a-z]) &gt; $1 $1;

   148  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#

   149  *     double lowercase letters<br>

   150  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>

   151  * </blockquote>

   152  *

   153  * <p>The segment of the input string to be copied is delimited by

   154  * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to

   155  * nine segments may be defined. Segments may not overlap. In the

   156  * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;

   157  * represent the input string segments, in left-to-right order of

   158  * definition.</p>

   159  *

   160  * <p><b>Anchors</b></p>

   161  *

   162  * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the

   163  * special characters '<code>^</code>' and '<code>$</code>'. For example:</p>

   164  *

   165  * <blockquote>

   166  *   <p><code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text<br>

   167  *   &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances

   168  *   of 'a'<br>

   169  *   &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text<br>

   170  *   &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances

   171  *   of 'z'</code></p>

   172  * </blockquote>

   173  *

   174  * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.

   175  * This is done by including a virtual anchor character '<code>$</code>' at the end of the

   176  * set pattern. Although this is usually the match character for the end anchor, the set will

   177  * match either the beginning or the end of the text, depending on its placement. For

   178  * example:</p>

   179  *

   180  * <blockquote>

   181  *   <p><code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor<br>

   182  *   $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start<br>

   183  *   &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code></p>

   184  * </blockquote>

   185  *

   186  * <p><b>Example</b> </p>

   187  *

   188  * <p>The following example rules illustrate many of the features of

   189  * the rule language. </p>

   190  *

   191  * <table border="0" cellpadding="4">

   192  *     <tr>

   193  *         <td valign="top">Rule 1.</td>

   194  *         <td valign="top" nowrap><code>abc{def}&gt;x|y</code></td>

   195  *     </tr>

   196  *     <tr>

   197  *         <td valign="top">Rule 2.</td>

   198  *         <td valign="top" nowrap><code>xyz&gt;r</code></td>

   199  *     </tr>

   200  *     <tr>

   201  *         <td valign="top">Rule 3.</td>

   202  *         <td valign="top" nowrap><code>yz&gt;q</code></td>

   203  *     </tr>

   204  * </table>

   205  *

   206  * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;

   207  * yields the following results: </p>

   208  *

   209  * <table border="0" cellpadding="4">

   210  *     <tr>

   211  *         <td valign="top" nowrap><code>|adefabcdefz</code></td>

   212  *         <td valign="top">Initial state, no rules match. Advance

   213  *         cursor.</td>

   214  *     </tr>

   215  *     <tr>

   216  *         <td valign="top" nowrap><code>a|defabcdefz</code></td>

   217  *         <td valign="top">Still no match. Rule 1 does not match

   218  *         because the preceding context is not present.</td>

   219  *     </tr>

   220  *     <tr>

   221  *         <td valign="top" nowrap><code>ad|efabcdefz</code></td>

   222  *         <td valign="top">Still no match. Keep advancing until

   223  *         there is a match...</td>

   224  *     </tr>

   225  *     <tr>

   226  *         <td valign="top" nowrap><code>ade|fabcdefz</code></td>

   227  *         <td valign="top">...</td>

   228  *     </tr>

   229  *     <tr>

   230  *         <td valign="top" nowrap><code>adef|abcdefz</code></td>

   231  *         <td valign="top">...</td>

   232  *     </tr>

   233  *     <tr>

   234  *         <td valign="top" nowrap><code>adefa|bcdefz</code></td>

   235  *         <td valign="top">...</td>

   236  *     </tr>

   237  *     <tr>

   238  *         <td valign="top" nowrap><code>adefab|cdefz</code></td>

   239  *         <td valign="top">...</td>

   240  *     </tr>

   241  *     <tr>

   242  *         <td valign="top" nowrap><code>adefabc|defz</code></td>

   243  *         <td valign="top">Rule 1 matches; replace &quot;<code>def</code>&quot;

   244  *         with &quot;<code>xy</code>&quot; and back up the cursor

   245  *         to before the '<code>y</code>'.</td>

   246  *     </tr>

   247  *     <tr>

   248  *         <td valign="top" nowrap><code>adefabcx|yz</code></td>

   249  *         <td valign="top">Although &quot;<code>xyz</code>&quot; is

   250  *         present, rule 2 does not match because the cursor is

   251  *         before the '<code>y</code>', not before the '<code>x</code>'.

   252  *         Rule 3 does match. Replace &quot;<code>yz</code>&quot;

   253  *         with &quot;<code>q</code>&quot;.</td>

   254  *     </tr>

   255  *     <tr>

   256  *         <td valign="top" nowrap><code>adefabcxq|</code></td>

   257  *         <td valign="top">The cursor is at the end;

   258  *         transliteration is complete.</td>

   259  *     </tr>

   260  * </table>

   261  *

   262  * <p>The order of rules is significant. If multiple rules may match

   263  * at some point, the first matching rule is applied. </p>

   264  *

   265  * <p>Forward and reverse rules may have an empty output string.

   266  * Otherwise, an empty left or right hand side of any statement is a

   267  * syntax error. </p>

   268  *

   269  * <p>Single quotes are used to quote any character other than a

   270  * digit or letter. To specify a single quote itself, inside or

   271  * outside of quotes, use two single quotes in a row. For example,

   272  * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the

   273  * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.

   274  * </p>

   275  *

   276  * <p><b>Notes</b> </p>

   277  *

   278  * <p>While a RuleBasedTransliterator is being built, it checks that

   279  * the rules are added in proper order. For example, if the rule

   280  * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,

   281  * then the second rule will throw an exception. The reason is that

   282  * the second rule can never be triggered, since the first rule

   283  * always matches anything it matches. In other words, the first

   284  * rule <em>masks</em> the second rule. </p>

   285  *

   286  * @author Alan Liu

   287  * @internal Use transliterator factory methods instead since this class will be removed in that release.

   288  */

   289 class RuleBasedTransliterator : public Transliterator {

   290 private:

   291     /**

   292      * The data object is immutable, so we can freely share it with

   293      * other instances of RBT, as long as we do NOT own this object.

   294      *  TODO:  data is no longer immutable.  See bugs #1866, 2155

   295      */

   296     TransliterationRuleData* fData;

   298     /**

   299      * If true, we own the data object and must delete it.

   300      */

   301     UBool isDataOwned;

   303 public:

   305     /**

   306      * Constructs a new transliterator from the given rules.

   307      * @param rules rules, separated by ';'

   308      * @param direction either FORWARD or REVERSE.

   309      * @exception IllegalArgumentException if rules are malformed (Java-API wording; NOTE(review): presumably reported via parseError/status here -- confirm).

   310      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   311      */

   312     RuleBasedTransliterator(const UnicodeString& id,

   313                             const UnicodeString& rules,

   314                             UTransDirection direction,

   315                             UnicodeFilter* adoptedFilter,

   316                             UParseError& parseError,

   317                             UErrorCode& status);

   319     /**

   320      * Constructs a new transliterator from the given rules.

   321      * @param rules rules, separated by ';'

   322      * @param direction either FORWARD or REVERSE.

   323      * @exception IllegalArgumentException if rules are malformed.

   324      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   325      */

   326     /*RuleBasedTransliterator(const UnicodeString& id,

   327                             const UnicodeString& rules,

   328                             UTransDirection direction,

   329                             UnicodeFilter* adoptedFilter,

   330                             UErrorCode& status);*/

   332     /**

   333      * Convenience constructor with no filter.

   334      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   335      */

   336     /*RuleBasedTransliterator(const UnicodeString& id,

   337                             const UnicodeString& rules,

   338                             UTransDirection direction,

   339                             UErrorCode& status);*/

   341     /**

   342      * Convenience constructor with no filter and FORWARD direction.

   343      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   344      */

   345     /*RuleBasedTransliterator(const UnicodeString& id,

   346                             const UnicodeString& rules,

   347                             UErrorCode& status);*/

   349     /**

   350      * Convenience constructor with FORWARD direction.

   351      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   352      */

   353     /*RuleBasedTransliterator(const UnicodeString& id,

   354                             const UnicodeString& rules,

   355                             UnicodeFilter* adoptedFilter,

   356                             UErrorCode& status);*/

   357 private:

   359      friend class TransliteratorRegistry; // to access TransliterationRuleData convenience ctor

   360     /**

   361      * Convenience constructor.

   362      * @param id            the id for the transliterator.

   363      * @param theData       the rule data for the transliterator.

   364      * @param adoptedFilter the filter for the transliterator

   365      */

   366     RuleBasedTransliterator(const UnicodeString& id,

   367                             const TransliterationRuleData* theData,

   368                             UnicodeFilter* adoptedFilter = 0);

   371     friend class Transliterator; // to access following ctor

   373     /**

   374      * Internal constructor.

   375      * @param id            the id for the transliterator.

   376      * @param data          the rule data for the transliterator.

   377      * @param isDataAdopted determines who owns the 'data' object. If true, the caller should not delete 'data'.

   378      */

   379     RuleBasedTransliterator(const UnicodeString& id,

   380                             TransliterationRuleData* data,

   381                             UBool isDataAdopted);

   383 public:

   385     /**

   386      * Copy constructor.

   387      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   388      */

   389     RuleBasedTransliterator(const RuleBasedTransliterator&);

   391     virtual ~RuleBasedTransliterator();

   393     /**

   394      * Implement Transliterator API.

   395      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   396      */

   397     virtual Transliterator* clone(void) const;

   399 protected:

   400     /**

   401      * Implements {@link Transliterator#handleTransliterate}.

   402      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   403      */

   404     virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets,

   405                                      UBool isIncremental) const;

   407 public:

   408     /**

   409      * Return a representation of this transliterator as source rules.

   410      * These rules will produce an equivalent transliterator if used

   411      * to construct a new transliterator.

   412      * @param result the string to receive the rules.  Previous

   413      * contents will be deleted.

   414      * @param escapeUnprintable if TRUE then convert unprintable

   415      * characters to their hex escape representations, \uxxxx or

   416      * \Uxxxxxxxx.  Unprintable characters are those other than

   417      * U+000A, U+0020..U+007E.

   418      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   419      */

   420     virtual UnicodeString& toRules(UnicodeString& result,

   421                                    UBool escapeUnprintable) const;

   423 protected:

   424     /**

   425      * Implement Transliterator framework

   426      */

   427     virtual void handleGetSourceSet(UnicodeSet& result) const;

   429 public:

   430     /**

   431      * Override Transliterator framework

   432      */

   433     virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;

   435     /**

   436      * Return the class ID for this class.  This is useful only for

   437      * comparing to a return value from getDynamicClassID().  For example:

   438      * <pre>

   439      * .      Base* polymorphic_pointer = createPolymorphicObject();

   440      * .      if (polymorphic_pointer->getDynamicClassID() ==

   441      * .          Derived::getStaticClassID()) ...

   442      * </pre>

   443      * @return          The class ID for all objects of this class.

   444      * @internal Use transliterator factory methods instead since this class will be removed in that release.

   445      */

   446     U_I18N_API static UClassID U_EXPORT2 getStaticClassID(void);

   448     /**

   449      * Returns a unique class ID <b>polymorphically</b>.  This method

   450      * is to implement a simple version of RTTI, since not all C++

   451      * compilers support genuine RTTI.  Polymorphic operator==() and

   452      * clone() methods call this method.

   453      *

   454      * @return The class ID for this object. All objects of a given

   455      * class have the same class ID.  Objects of other classes have

   456      * different class IDs.

   457      */

   458     virtual UClassID getDynamicClassID(void) const;

   460 private:

   462     void _construct(const UnicodeString& rules,

   463                     UTransDirection direction,

   464                     UParseError& parseError,

   465                     UErrorCode& status);

   466 };

   469 U_NAMESPACE_END

   471 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

   473 #endif

The Tor Browser / file revision

intl/icu/source/i18n/rbt.h@fc2d59ddac77

intl/icu/source/i18n/rbt.h