intl/icu/source/common/rbbicst.pl

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rwxr-xr-x

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 #**************************************************************************
     2 #   Copyright (C) 2002-2005 International Business Machines Corporation   *
     3 #   and others. All rights reserved.                                      *
     4 #**************************************************************************
     5 #
     6 #  rbbicst   Compile the RBBI rule parser state table data into initialized C data.
     7 #            Usage:
     8 #                   cd icu/source/common
     9 #                   perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
    10 #                   perl rbbicst.pl -j < rbbirpt.txt > RBBIRuleParseTable.java
    11 #
    12 #             The output file, rbbirpt.h, is included by some of the .cpp rbbi
    13 #             implementation files.   This perl script is NOT run as part
    14 #             of a normal ICU build.  It is run by hand when needed, and the
    15 #             rbbirpt.h generated file is put back into cvs.
    16 #
    17 #             See rbbirpt.txt for a description of the input format for this script.
    18 #
    #
    # Command line:  an optional leading "-j" selects Java output instead of
    #                the default C++ header.  The flag is consumed so that the
    #                <> operator used below does not try to open it as a file.
    #
    if (@ARGV and $ARGV[0] eq '-j') {
        shift @ARGV;
        $javaOutput = 1;
    }

    #
    # Parser state shared with the rest of the script.
    #
    $line_num   = 0;         # Number of the input line currently being read.
    $num_states = 1;         # Row number that the next transition line will get.
                             #   Row 0 is reserved for a dummy table entry.

    # "pop" is predefined with the value 255.  Having it in the table stops any
    #   state from being labelled "pop", and lets "pop" be used in the
    #   next-state field like any other state name.
    $states{'pop'} = 255;
    #
    # Main parse loop.
    #
    #   Reads the state table description one line at a time and records each
    #   transition line as one row.  Rows are numbered from 1 ($num_states is
    #   always the number of the row being compiled) and stored in parallel
    #   arrays indexed by row number:
    #
    #       @state_line_num        input line the row came from (for messages)
    #       @state_literal_chars   the literal character, for rows written as 'x'
    #       @state_char_class      the character class name, for all other rows
    #       @state_flag            the 'n' flag, as TRUE/FALSE (true/false for Java)
    #       @state_dest_state      name of the next state
    #       @state_push_state      name of the state to push, if any
    #       @state_func_name       name of the action, if any
    #
    #   State labels are recorded in %states (name -> row number) and in
    #   @stateNames (row number -> name).
    #
    #   All diagnostics go to STDERR.  STDOUT carries the generated source file,
    #   so a message printed there would be buried in (and corrupt) the output.
    #
    line_loop: while (<>) {
        chomp();
        @fields = split();
        $line_num++;

        # Remove # comments, which are any fields beginning with a #, plus all
        #  that follow on the line.
        for my $idx (0 .. $#fields) {
            if ($fields[$idx] =~ /^#/) {
                splice(@fields, $idx);
                last;
            }
        }

        # Ignore blank lines, and those with no fields left after stripping comments.
        next line_loop if @fields == 0;

        #
        # State Label:  handling.
        #    Does the first token end with a ":"?  If so, it's the name of a state.
        #    Put it in a hash, together with the current row number,
        #        so that we can later look up the number from the name.
        #
        if ($fields[0] =~ /:$/) {
            my $state_name = $fields[0];
            $state_name =~ s/:$//;       # strip only the trailing colon.

            if (exists $states{$state_name}) {
                print STDERR "  rbbicst: at line $line_num duplicate definition of state $state_name\n";
            }
            $states{$state_name}     = $num_states;
            $stateNames[$num_states] = $state_name;

            # If the label was the only thing on this line, go on to the next line,
            # otherwise assume that a transition is on the same line and fall through.
            next line_loop if @fields == 1;
            shift @fields;
        }

        #
        # State Transition line.
        #   syntax is this,
        #       character   [n]  target-state  [^push-state]  [function-name]
        #   where
        #      [something]   is an optional something
        #      character     is either a single quoted character e.g. '['
        #                       or a name of a character class, e.g. white_space
        #

        # Remember the line number with each row so that later error messages
        #   can point back at the input.
        $state_line_num[$num_states] = $line_num;

        #
        # First field, character class or literal character for this transition.
        #
        if ($fields[0] =~ /^'.'$/) {
            # A quoted literal character.  Take the middle character directly, so
            #   that a quoted quote character (''') is kept rather than stripped away.
            $state_literal_chars[$num_states] = substr($fields[0], 1, 1);
        }
        else {
            # The name of a character class.
            $state_char_class[$num_states] = $fields[0];
            if ($fields[0] =~ /\W/) {
                print STDERR "  rbbicsts:  at line $line_num, bad character literal or character class name.\n";
                print STDERR "     scanning $fields[0]\n";
                exit(-1);
            }
        }
        shift @fields;

        #
        # do the 'n' flag
        #
        $state_flag[$num_states] = $javaOutput ? "false" : "FALSE";
        if (@fields && $fields[0] eq "n") {
            $state_flag[$num_states] = $javaOutput ? "true" : "TRUE";
            shift @fields;
        }

        #
        # do the destination state.
        #
        if (@fields == 0) {
            print STDERR "  rbbicsts:  at line $line_num, destination state missing.\n";
            exit(-1);
        }
        $state_dest_state[$num_states] = shift @fields;

        #
        # do the push state, if present.
        #
        if (@fields && $fields[0] =~ /^\^/) {
            my $push_state = substr(shift @fields, 1);     # drop the leading ^
            if ($push_state eq "") {
                print STDERR "  rbbicsts:  at line $line_num, expected state after ^ (no spaces).\n";
                exit(-1);
            }
            $state_push_state[$num_states] = $push_state;
        }

        #
        # Lastly, do the optional action name.
        #
        if (@fields) {
            $state_func_name[$num_states] = shift @fields;
        }

        #
        #  There should be no fields left on the line at this point.
        #  This is reported but is not fatal; the row is still recorded.
        #
        if (@fields > 0) {
            print STDERR "  rbbicsts:  at line $line_num, unexpected extra stuff on input line.\n";
            print STDERR "     scanning $fields[0]\n";
        }
        $num_states++;
    }
   #
   # We've read in the whole file, now go back and output the
   #   source code for the state transition table.
   #
   # All rows are read first, before anything is written, so that the numbers
   # of the destination states are all available when the table is emitted.
   #

   #
   # Make hashes for the names of the character classes and
   #      for the names of the actions that appeared.
   #
   #   %charClasses   character class name -> 1
   #   %actions       action name          -> 1
   #
   #   Rows that named no action are given the action "doNOP" here, so every
   #   row has an action name afterwards.
   #
   for my $row (1 .. $num_states - 1) {
       my $class_name = $state_char_class[$row];
       if (defined $class_name && $class_name ne "") {
           $charClasses{$class_name} ||= 1;
       }

       if (!defined $state_func_name[$row] || $state_func_name[$row] eq "") {
           $state_func_name[$row] = "doNOP";
       }
       $actions{$state_func_name[$row]} = 1;
   }
   #
   # Check that all of the destination states and push states named in the
   # transition rows have been defined.
   #
   #   Every undefined name is reported on STDERR (STDOUT is reserved for the
   #   generated source), and the script then stops before writing any output.
   #
   $states{"exit"} = 0;              # Predefined state name, terminates state machine.

   for my $row (1 .. $num_states - 1) {
       my $dest_state = $state_dest_state[$row];
       if (!exists $states{$dest_state}) {
           print STDERR "Error at line $state_line_num[$row]: target state \"$dest_state\" is not defined.\n";
           $errors++;
       }

       # The push state is optional; only check it when the row has one.
       my $push_state = $state_push_state[$row];
       if (defined $push_state && $push_state ne "" && !exists $states{$push_state}) {
           print STDERR "Error at line $state_line_num[$row]: push state \"$push_state\" is not defined.\n";
           $errors++;
       }
   }

   die "rbbicst: $errors undefined state reference(s), no output generated.\n" if $errors;
   #
   # Assign a number to each of the character classes used.
   #   Ordinary classes are numbered upwards from 128, in sorted name order.
   #   The values 0-127 in the state table are used for matching
   #     individual ASCII characters (the only thing that can appear in the rules.)
   #   The names in %reserved_code below (default, etc.) get fixed numbers
   #     because they do not correspond to a normal set of characters,
   #     but trigger special handling by code in the state machine.
   #
   my %reserved_code = (
       "default"  => 255,
       "escaped"  => 254,
       "escapedP" => 253,
       "eof"      => 252,
   );

   my $next_code = 128;
   for my $class_name (sort keys %charClasses) {
       $charClasses{$class_name} = exists $reserved_code{$class_name}
                                 ? $reserved_code{$class_name}
                                 : $next_code++;
   }
   # Current four digit year, for the copyright notice of the generated file.
   #   Element 5 of localtime's list is the number of years since 1900.
   my $year = (localtime)[5] + 1900;
   #
   # Write the generated source file to STDOUT.
   #
   #   With -j ($javaOutput set) a Java class, RBBIRuleParseTable, is written;
   #   otherwise a C++ header guarded by RBBIRPT_H.  Both contain constants for
   #   the character classes and actions, followed by the state transition
   #   table with one element per row.  Element 0 is a dummy; real rows start
   #   at index 1.
   #
   if ($javaOutput) {
       print "/*\n",
             " *******************************************************************************\n",
             " * Copyright (C) 2003-$year,\n",
             " * International Business Machines Corporation and others. All Rights Reserved.\n",
             " *******************************************************************************\n",
             " */\n",
             " \n",
             "package com.ibm.icu.text;\n",
             " \n",
             "/**\n",
             " * Generated Java File.  Do not edit by hand.\n",
             " * This file contains the state table for the ICU Rule Based Break Iterator\n",
             " * rule parser.\n",
             " * It is generated by the Perl script \"rbbicst.pl\" from\n",
             " * the rule parser state definitions file \"rbbirpt.txt\".\n",
             " * \@internal \n",
             " *\n",
             " */\n";

       print "class RBBIRuleParseTable\n",
             "{\n";

       #
       # Emit the constants for the actions to be performed.
       #   Actions are numbered from 1, in sorted name order.
       #
       my $action_code = 1;
       for my $action (sort keys %actions) {
           print "     static final short $action = $action_code;\n";
           $action_code++;
       }
       print " \n";

       #
       # Emit constants for char class names
       #
       for my $class_name (sort keys %charClasses) {
           print "     static final short kRuleSet_$class_name = $charClasses{$class_name};\n";
       }
       print "\n\n";

       print "   static class RBBIRuleTableElement { \n",
             "      short      fAction; \n",
             "      short      fCharClass; \n",
             "      short      fNextState; \n",
             "      short      fPushState; \n",
             "      boolean    fNextChar;  \n",
             "      String     fStateName; \n",
             "      RBBIRuleTableElement(short a, int cc, int ns, int ps, boolean nc, String sn) {  \n",
             "      fAction = a; \n",
             "      fCharClass = (short)cc; \n",
             "      fNextState = (short)ns; \n",
             "      fPushState = (short)ps; \n",
             "      fNextChar  = nc; \n",
             "      fStateName = sn; \n",
             "   } \n",
             "   }; \n",
             "  \n";

       print "    static RBBIRuleTableElement[] gRuleParseStateTable = { \n ";
       print "      new RBBIRuleTableElement(doNOP, 0, 0,0,  true,   null )     //  0 \n";  # the unused state 0.
       for my $row (1 .. $num_states - 1) {
           print "     , new RBBIRuleTableElement($state_func_name[$row],";

           my $literal = $state_literal_chars[$row];
           if (defined $literal && $literal ne "") {
               # A quote or backslash must be escaped to form a valid Java
               #   character literal; every other character is written as is.
               (my $java_char = $literal) =~ s/(['\\])/\\$1/;
               print "'$java_char', ";
           }
           else {
               print " $charClasses{$state_char_class[$row]},";
           }
           print " $states{$state_dest_state[$row]},";

           # The push-state field is optional.  If omitted, fill field with a zero, which flags
           #   the state machine that there is no push state.
           my $push_state = $state_push_state[$row];
           if (!defined $push_state || $push_state eq "") {
               print "0, ";
           }
           else {
               print " $states{$push_state},";
           }
           print " $state_flag[$row], ";

           # If this is the first row of the table for this state, put out the state name.
           my $state_name = $stateNames[$row];
           if (defined $state_name && $state_name ne "") {
               print "  \"$state_name\") ";
           }
           else {
               print "  null ) ";
           }

           # Put out a comment showing the number (index) of this state row.
           print "    //  $row ";
           print "\n";
       }
       print " };\n";

       print "}; \n";
   }
   else {
       #
       #  C++ Output ...
       #
       print "//---------------------------------------------------------------------------------\n",
             "//\n",
             "// Generated Header File.  Do not edit by hand.\n",
             "//    This file contains the state table for the ICU Rule Based Break Iterator\n",
             "//    rule parser.\n",
             "//    It is generated by the Perl script \"rbbicst.pl\" from\n",
             "//    the rule parser state definitions file \"rbbirpt.txt\".\n",
             "//\n",
             "//   Copyright (C) 2002-$year International Business Machines Corporation \n",
             "//   and others. All rights reserved.  \n",
             "//\n",
             "//---------------------------------------------------------------------------------\n",
             "#ifndef RBBIRPT_H\n",
             "#define RBBIRPT_H\n",
             "\n",
             "U_NAMESPACE_BEGIN\n";

       #
       # Emit the constants for indices of Unicode Sets.
       #   One constant is defined for each character class whose number is
       #   below 250; classes with higher numbers get no constant here.
       #
       print "//\n",
             "// Character classes for RBBI rule scanning.\n",
             "//\n";
       for my $class_name (sort keys %charClasses) {
           if ($charClasses{$class_name} < 250) {
               # Normal character class.
               print "    static const uint8_t kRuleSet_$class_name = $charClasses{$class_name};\n";
           }
       }
       print "\n\n";

       #
       # Emit the enum for the actions to be performed.
       #
       print "enum RBBI_RuleParseAction {\n";
       for my $action (sort keys %actions) {
           print "    $action,\n";
       }
       print "    rbbiLastAction};\n\n";

       #
       # Emit the struct definition for transition table elements.
       #
       print "//-------------------------------------------------------------------------------\n",
             "//\n",
             "//  RBBIRuleTableEl    represents the structure of a row in the transition table\n",
             "//                     for the rule parser state machine.\n",
             "//-------------------------------------------------------------------------------\n",
             "struct RBBIRuleTableEl {\n",
             "    RBBI_RuleParseAction          fAction;\n",
             "    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character\n",
             "                                                    // 128-255:  character class index\n",
             "    uint8_t                       fNextState;       // 0-250:    normal next-stat numbers\n",
             "                                                    // 255:      pop next-state from stack.\n",
             "    uint8_t                       fPushState;\n",
             "    UBool                         fNextChar;\n",
             "};\n\n";

       #
       # emit the state transition table
       #
       print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
       print "    {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
       for my $row (1 .. $num_states - 1) {
           print "    , {$state_func_name[$row],";

           my $literal = $state_literal_chars[$row];
           if (defined $literal && $literal ne "") {
               # Use the numeric value, so EBCDIC machines are ok.  The character
               #   is passed as an argument, never as part of the format string,
               #   so a literal such as % cannot be taken for a conversion.
               printf(" %d /* %s */,", ord($literal), $literal);
           }
           else {
               print " $charClasses{$state_char_class[$row]},";
           }
           print " $states{$state_dest_state[$row]},";

           # The push-state field is optional.  If omitted, fill field with a zero, which flags
           #   the state machine that there is no push state.
           my $push_state = $state_push_state[$row];
           if (!defined $push_state || $push_state eq "") {
               print "0, ";
           }
           else {
               print " $states{$push_state},";
           }
           print " $state_flag[$row]} ";

           # Put out a C++ comment showing the number (index) of this state row,
           #   and, if this is the first row of the table for this state, the state name.
           print "    //  $row ";
           my $state_name = $stateNames[$row];
           if (defined $state_name && $state_name ne "") {
               print "     $state_name";
           }
           print "\n";
       }
       print " };\n";

       #
       # emit a mapping array from state numbers to state names.
       #
       #    This array is used for producing debugging output from the rule parser.
       #    Rows that do not start a state get a 0 entry.
       #
       print "#ifdef RBBI_DEBUG\n";
       print "static const char * const RBBIRuleStateNames[] = {";
       for my $row (0 .. $num_states - 1) {
           my $state_name = $stateNames[$row];
           if (defined $state_name && $state_name ne "") {
               print "     \"$state_name\",\n";
           }
           else {
               print "    0,\n";
           }
       }
       print "    0};\n";
       print "#endif\n\n";

       print "U_NAMESPACE_END\n";
       print "#endif\n";
   }

mercurial