intl/icu/source/common/rbbicst.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/rbbicst.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,453 @@
     1.4 +#**************************************************************************
     1.5 +#   Copyright (C) 2002-2005 International Business Machines Corporation   *
     1.6 +#   and others. All rights reserved.                                      *
     1.7 +#**************************************************************************
     1.8 +#
     1.9 +#  rbbicst   Compile the RBBI rule parser state table data into initialized C data.
    1.10 +#            Usage:
    1.11 +#                   cd icu/source/common
    1.12 +#                   perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
    1.13 +#                   perl rbbicst.pl -j < rbbirpt.txt > RBBIRuleParseTable.java
    1.14 +#
    1.15 +#             The output file, rbbirpt.h, is included by some of the .cpp rbbi
    1.16 +#             implementation files.   This perl script is NOT run as part
    1.17 +#             of a normal ICU build.  It is run by hand when needed, and the
    1.18 +#             rbbirpt.h generated file is put back into cvs.
    1.19 +#
    1.20 +#             See rbbirpt.txt for a description of the input format for this script.
    1.21 +#
    1.22 +
    1.23 +if ($ARGV[0] eq "-j") {          # "-j" selects Java output; the default is a C++ header.
    1.24 +    $javaOutput = 1;
    1.25 +    shift @ARGV;                  # Drop the option so that <> below does not treat it as a file name.
    1.26 +}
    1.27 +
    1.28 +
    1.29 +$num_states = 1;     # Number of the state table row being compiled.  Rows start at 1.
    1.30 +$line_num  = 0;      # The line number in the input file.
    1.31 +
    1.32 +$states{"pop"} = 255;    # Predefine "pop" in the table of state names, with the value 255.
    1.33 +                         # A state labelled "pop" is then reported as a duplicate definition,
    1.34 +                         #  and references to "pop" in the next state field resolve to 255.
    1.35 +
    1.36 +line_loop: while (<>) {          # Pass 1: compile each input line into the per-row arrays.
    1.37 +    chomp();
    1.38 +    $line = $_;
    1.39 +    @fields = split();
    1.40 +    $line_num++;
    1.41 +
    1.42 +    # Remove # comments, which are any fields beginning with a #, plus all
    1.43 +    #  that follow on the line.  (A quoted '#' literal begins with a quote, so it is kept.)
    1.44 +    for ($i=0; $i<@fields; $i++) {
    1.45 +        if ($fields[$i] =~ /^#/) {
    1.46 +            @fields = @fields[0 .. $i-1];
    1.47 +            last;
    1.48 +        }
    1.49 +    }
    1.50 +    # Ignore blank lines, and those with no fields left after stripping comments.
    1.51 +    if (@fields == 0) {
    1.52 +        next;
    1.53 +    }
    1.54 +
    1.55 +    #
    1.56 +    # State Label:  handling.
    1.57 +    #    Does the first token end with a ":"?  If so, it's the name  of a state.
    1.58 +    #    Put in a hash, together with the current state number,
    1.59 +    #        so that we can later look up the number from the name.
    1.60 +    #
    1.61 +    if ($fields[0] =~ /.*:$/) {
    1.62 +        $state_name = $fields[0];
    1.63 +        $state_name =~ s/:$//;       # strip off the trailing colon from the state name.
    1.64 +
    1.65 +        if ($states{$state_name} != 0) {
    1.66 +            print "  rbbicst: at line $line_num duplicate definition of state $state_name\n";
    1.67 +        }
    1.68 +        $states{$state_name} = $num_states;
    1.69 +        $stateNames[$num_states] = $state_name;
    1.70 +
    1.71 +        # if the label was the only thing on this line, go on to the next line,
    1.72 +        # otherwise assume that a state definition is on the same line and fall through.
    1.73 +        if (@fields == 1) {
    1.74 +            next line_loop;
    1.75 +        }
    1.76 +        shift @fields;                       # shift off label field in preparation
    1.77 +                                             #  for handling the rest of the line.
    1.78 +    }
    1.79 +
    1.80 +    #
    1.81 +    # State Transition line.
    1.82 +    #   syntax is this,
    1.83 +    #       character   [n]  target-state  [^push-state]  [function-name]
    1.84 +    #   where
    1.85 +    #      [something]   is an optional something
    1.86 +    #      character     is either a single quoted character e.g. '['
    1.87 +    #                       or a name of a character class, e.g. white_space
    1.88 +    #
    1.89 +
    1.90 +    $state_line_num[$num_states] = $line_num;   # remember line number with each state
    1.91 +                                                #  so we can make better error messages later.
    1.92 +    #
    1.93 +    # First field, character class or literal character for this transition.
    1.94 +    #
    1.95 +    if ($fields[0] =~ /^'.'$/) {
    1.96 +        # We've got a quoted literal character.  Store it without the quotes.
    1.97 +        $state_literal_chars[$num_states] = $fields[0];
    1.98 +        $state_literal_chars[$num_states] =~ s/'//g;
    1.99 +    } else {
   1.100 +        # We've got the name of a character class.  Only word characters are allowed in it.
   1.101 +        $state_char_class[$num_states] = $fields[0];
   1.102 +        if ($fields[0] =~ /[\W]/) {
   1.103 +            print "  rbbicsts:  at line $line_num, bad character literal or character class name.\n";
   1.104 +            print "     scanning $fields[0]\n";
   1.105 +            exit(-1);
   1.106 +        }
   1.107 +    }
   1.108 +    shift @fields;
   1.109 +
   1.110 +    #
   1.111 +    # do the 'n' flag.  It is written out later as the last column of the table row.
   1.112 +    #
   1.113 +    $state_flag[$num_states] = $javaOutput? "false" : "FALSE";
   1.114 +    if ($fields[0] eq "n") {
   1.115 +        $state_flag[$num_states] = $javaOutput? "true": "TRUE";
   1.116 +        shift @fields;
   1.117 +    }
   1.118 +
   1.119 +    #
   1.120 +    # do the destination state.  Only the name is stored here; it is resolved after the loop.
   1.121 +    #
   1.122 +    $state_dest_state[$num_states] = $fields[0];
   1.123 +    if ($fields[0] eq "") {
   1.124 +        print "  rbbicsts:  at line $line_num, destination state missing.\n";
   1.125 +        exit(-1);
   1.126 +    }
   1.127 +    shift @fields;
   1.128 +
   1.129 +    #
   1.130 +    # do the push state, if present.  It is written as ^name, with no space after the ^.
   1.131 +    #
   1.132 +    if ($fields[0] =~ /^\^/) {
   1.133 +        $fields[0] =~ s/^\^//;
   1.134 +        $state_push_state[$num_states] = $fields[0];
   1.135 +        if ($fields[0] eq "" ) {
   1.136 +            print "  rbbicsts:  at line $line_num, expected state after ^ (no spaces).\n";
   1.137 +            exit(-1);
   1.138 +        }
   1.139 +        shift @fields;
   1.140 +    }
   1.141 +
   1.142 +    #
   1.143 +    # Lastly, do the optional action name.
   1.144 +    #
   1.145 +    if ($fields[0] ne "") {
   1.146 +        $state_func_name[$num_states] = $fields[0];
   1.147 +        shift @fields;
   1.148 +    }
   1.149 +
   1.150 +    #
   1.151 +    #  There should be no fields left on the line at this point.
   1.152 +    #  Leftovers are reported, but the row is still kept.
   1.153 +    if (@fields > 0) {
   1.154 +        print "  rbbicsts:  at line $line_num, unexpected extra stuff on input line.\n";
   1.155 +        print "     scanning $fields[0]\n";
   1.156 +    }
   1.157 +    $num_states++;
   1.158 +}
   1.159 +
   1.160 +#
   1.161 +# We've read in the whole file, now go back and output the
   1.162 +#   C source code for the state transition table.
   1.163 +#
   1.164 +# We read all states first, before writing anything,  so that the state numbers
   1.165 +# for the destination states are all available to be written.
   1.166 +#
   1.167 +
   1.168 +#
   1.169 +# Make hashes whose keys are the names of the character classes and
   1.170 +#      the names of the actions that appeared.  Rows without an action get "doNOP".
   1.171 +#
   1.172 +for ($state=1; $state < $num_states; $state++) {
   1.173 +    if ($state_char_class[$state] ne "") {
   1.174 +        if ($charClasses{$state_char_class[$state]} == 0) {
   1.175 +            $charClasses{$state_char_class[$state]} = 1;
   1.176 +        }
   1.177 +    }
   1.178 +    if ($state_func_name[$state] eq "") {
   1.179 +        $state_func_name[$state] = "doNOP";
   1.180 +    }
   1.181 +    if ($actions{$state_func_name[$state]} == 0) {
   1.182 +        $actions{$state_func_name[$state]} = 1;
   1.183 +    }
   1.184 +}
   1.185 +
   1.186 +#
   1.187 +# Check that all of the destination and push states have been defined.
   1.188 +# Every undefined reference is reported before giving up, so that one run
   1.189 +# shows all of them.  A state number of 0 in %states means "not defined".
   1.190 +$states{"exit"} = 0;              # Predefined state name, terminates state machine.
   1.191 +for ($state=1; $state<$num_states; $state++) {
   1.192 +   if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {
   1.193 +       print "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
   1.194 +       $errors++;
   1.195 +   }
   1.196 +   if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {
   1.197 +       print "Error at line $state_line_num[$state]: push state \"$state_push_state[$state]\" is not defined.\n";
   1.198 +       $errors++;
   1.199 +   }
   1.200 +}
   1.201 +
   1.202 +die "rbbicst: $errors undefined state reference(s), no table generated.\n" if ($errors>0);
   1.203 +
   1.204 +#
   1.205 +# Assign numbers to each of the character classes used.
   1.206 +#   Sets are numbered from 128 upwards, in sorted name order.
   1.207 +#   The values 0-127 in the state table are used for matching
   1.208 +#     individual ASCII characters (the only thing that can appear in the rules.)
   1.209 +#   The "set" names appearing in the code below (default, etc.)  need special
   1.210 +#     handling because they do not correspond to a normal set of characters,
   1.211 +#     but trigger special handling by code in the state machine.  They get 252-255.
   1.212 +#
   1.213 +$i = 128;
   1.214 +foreach $setName (sort keys %charClasses) {
   1.215 +    if ($setName eq "default") {
   1.216 +        $charClasses{$setName} = 255;}
   1.217 +    elsif ($setName eq "escaped") {
   1.218 +        $charClasses{$setName} = 254;}
   1.219 +    elsif ($setName eq "escapedP") {
   1.220 +        $charClasses{$setName} = 253;}
   1.221 +    elsif ($setName eq "eof") {
   1.222 +        $charClasses{$setName} = 252;}
   1.223 +    else {
   1.224 +        # Normal (single) character class.  Give it the next free number.
   1.225 +        $charClasses{$setName} = $i;
   1.226 +        $i++;
   1.227 +    }
   1.228 +}
   1.229 +
   1.230 +
   1.231 +my ($sec, $min, $hour, , $day, $mon, $year, $wday, $yday, $isdst) = localtime;
   1.232 +$year += 1900;       # localtime counts years from 1900.  $year goes into the copyright notice.
   1.233 +
   1.234 +if ($javaOutput) {       # Java output: the source text of class RBBIRuleParseTable.
   1.235 +    print "/*\n";
   1.236 +    print " *******************************************************************************\n";
   1.237 +    print " * Copyright (C) 2003-$year,\n";
   1.238 +    print " * International Business Machines Corporation and others. All Rights Reserved.\n";
   1.239 +    print " *******************************************************************************\n";
   1.240 +    print " */\n";
   1.241 +    print " \n";
   1.242 +    print "package com.ibm.icu.text;\n";
   1.243 +    print " \n";
   1.244 +    print "/**\n";
   1.245 +    print " * Generated Java File.  Do not edit by hand.\n";
   1.246 +    print " * This file contains the state table for the ICU Rule Based Break Iterator\n";
   1.247 +    print " * rule parser.\n";
   1.248 +    print " * It is generated by the Perl script \"rbbicst.pl\" from\n";
   1.249 +    print " * the rule parser state definitions file \"rbbirpt.txt\".\n";
   1.250 +    print " * \@internal \n";
   1.251 +    print " *\n";
   1.252 +    print " */\n";
   1.253 +
   1.254 +    print "class RBBIRuleParseTable\n";
   1.255 +    print "{\n";
   1.256 +
   1.257 +    #
   1.258 +    # Emit a constant for each action name, numbered from 1 in sorted name order.
   1.259 +    #
   1.260 +    $n = 1;
   1.261 +    foreach $act (sort keys %actions) {
   1.262 +        print "     static final short $act = $n;\n";
   1.263 +        $n++;
   1.264 +    }
   1.265 +    print " \n";
   1.266 +    
   1.267 +    #
   1.268 +    # Emit a kRuleSet_ constant for every char class name, the special ones included.
   1.269 +    #
   1.270 +    foreach $setName (sort keys %charClasses) {
   1.271 +       print "     static final short kRuleSet_$setName = $charClasses{$setName};\n";
   1.272 +    }
   1.273 +    print "\n\n";
   1.274 +    
   1.275 +    
   1.276 +    print "   static class RBBIRuleTableElement { \n";
   1.277 +    print "      short      fAction; \n";
   1.278 +    print "      short      fCharClass; \n";
   1.279 +    print "      short      fNextState; \n";
   1.280 +    print "      short      fPushState; \n";
   1.281 +    print "      boolean    fNextChar;  \n";
   1.282 +    print "      String     fStateName; \n";
   1.283 +    print "      RBBIRuleTableElement(short a, int cc, int ns, int ps, boolean nc, String sn) {  \n";
   1.284 +    print "      fAction = a; \n";
   1.285 +    print "      fCharClass = (short)cc; \n";
   1.286 +    print "      fNextState = (short)ns; \n";
   1.287 +    print "      fPushState = (short)ps; \n";
   1.288 +    print "      fNextChar  = nc; \n";
   1.289 +    print "      fStateName = sn; \n";
   1.290 +    print "   } \n";
   1.291 +    print "   }; \n";
   1.292 +    print "  \n";
   1.293 +    
   1.294 +    
   1.295 +    print "    static RBBIRuleTableElement[] gRuleParseStateTable = { \n ";
   1.296 +    print "      new RBBIRuleTableElement(doNOP, 0, 0,0,  true,   null )     //  0 \n";  # Row 0 is an unused dummy; real rows start at 1.
   1.297 +    for ($state=1; $state < $num_states; $state++) {
   1.298 +        print "     , new RBBIRuleTableElement($state_func_name[$state],";
   1.299 +        if ($state_literal_chars[$state] ne "") {
   1.300 +            $c = $state_literal_chars[$state];
   1.301 +            print("'$c', "); 
   1.302 +        }else {
   1.303 +            print " $charClasses{$state_char_class[$state]},";
   1.304 +        }
   1.305 +        print " $states{$state_dest_state[$state]},";
   1.306 + 
   1.307 +        # The push-state field is optional.  If omitted, fill field with a zero, which flags
   1.308 +        #   the state machine that there is no push state.
   1.309 +        if ($state_push_state[$state] eq "") {
   1.310 +            print "0, ";
   1.311 +        } else {
   1.312 +            print " $states{$state_push_state[$state]},";
   1.313 +        }
   1.314 +        print " $state_flag[$state], ";
   1.315 + 
   1.316 +        # If this is the first row of the table for this state, put out the state name;
   1.316 +        if ($stateNames[$state] ne "") {
   1.318 +            print "  \"$stateNames[$state]\") ";
   1.319 +        } else {
   1.320 +            print "  null ) ";
   1.321 +        }
   1.322 +            
   1.323 +        # Put out a Java comment showing the number (index) of this state row.
   1.324 +        print "    //  $state ";
   1.325 +        print "\n";
   1.326 +    }
   1.327 +    print " };\n";
   1.328 +
   1.329 +    print "}; \n";
   1.330 +    
   1.331 +}
   1.332 +else
   1.333 +{
   1.334 +    #
   1.335 +    #  C++ Output: the text of the generated header, guarded by RBBIRPT_H.
   1.336 +    #
   1.337 +
   1.338 +
   1.339 +    print "//---------------------------------------------------------------------------------\n";
   1.340 +    print "//\n";
   1.341 +    print "// Generated Header File.  Do not edit by hand.\n";
   1.342 +    print "//    This file contains the state table for the ICU Rule Based Break Iterator\n";
   1.343 +    print "//    rule parser.\n";
   1.344 +    print "//    It is generated by the Perl script \"rbbicst.pl\" from\n";
   1.345 +    print "//    the rule parser state definitions file \"rbbirpt.txt\".\n";
   1.346 +    print "//\n";
   1.347 +    print "//   Copyright (C) 2002-$year International Business Machines Corporation \n";
   1.348 +    print "//   and others. All rights reserved.  \n";
   1.349 +    print "//\n";
   1.350 +    print "//---------------------------------------------------------------------------------\n";
   1.351 +    print "#ifndef RBBIRPT_H\n";
   1.352 +    print "#define RBBIRPT_H\n";
   1.353 +    print "\n";
   1.354 +    print "U_NAMESPACE_BEGIN\n";
   1.355 +
   1.356 +    #
   1.357 +    # Emit the constants for indices of Unicode Sets
   1.358 +    #   Define one constant for each of the normal character classes encountered.
   1.359 +    #   Classes numbered 250 or above (the special names) get no constant here.
   1.360 +    #
   1.361 +    print "//\n";
   1.362 +    print "// Character classes for RBBI rule scanning.\n";
   1.363 +    print "//\n";
   1.364 +    foreach $setName (sort keys %charClasses) {
   1.365 +        if ($charClasses{$setName} < 250) {
   1.366 +           # Normal character class.
   1.367 +           print "    static const uint8_t kRuleSet_$setName = $charClasses{$setName};\n";
   1.368 +        }
   1.369 +    }
   1.370 +    print "\n\n";
   1.371 +
   1.372 +    #
   1.373 +    # Emit the enum for the actions to be performed, in sorted name order.
   1.374 +    #
   1.375 +    print "enum RBBI_RuleParseAction {\n";
   1.376 +    foreach $act (sort keys %actions) {
   1.377 +        print "    $act,\n";
   1.378 +    }
   1.379 +    print "    rbbiLastAction};\n\n";
   1.380 +
   1.381 +    #
   1.382 +    # Emit the struct definition for transition table elements.
   1.383 +    #
   1.384 +    print "//-------------------------------------------------------------------------------\n";
   1.385 +    print "//\n";
   1.386 +    print "//  RBBIRuleTableEl    represents the structure of a row in the transition table\n";
   1.387 +    print "//                     for the rule parser state machine.\n";
   1.388 +    print "//-------------------------------------------------------------------------------\n";
   1.389 +    print "struct RBBIRuleTableEl {\n";
   1.390 +    print "    RBBI_RuleParseAction          fAction;\n";
   1.391 +    print "    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character\n";
   1.392 +    print "                                                    // 128-255:  character class index\n";
   1.393 +    print "    uint8_t                       fNextState;       // 0-250:    normal next-stat numbers\n";
   1.394 +    print "                                                    // 255:      pop next-state from stack.\n";
   1.395 +    print "    uint8_t                       fPushState;\n";
   1.396 +    print "    UBool                         fNextChar;\n";
   1.397 +    print "};\n\n";
   1.398 +
   1.399 +    #
   1.400 +    # Emit the state transition table, one initializer per row in the struct's field order.
   1.401 +    #
   1.402 +    print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
   1.403 +    print "    {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
   1.404 +    for ($state=1; $state < $num_states; $state++) {
   1.405 +        print "    , {$state_func_name[$state],";
   1.406 +        if ($state_literal_chars[$state] ne "") {
   1.407 +            $c = $state_literal_chars[$state];
   1.408 +            printf(" %d /* $c */,", ord($c));   #  use numeric value, so EBCDIC machines are ok.
   1.409 +        }else {
   1.410 +            print " $charClasses{$state_char_class[$state]},";
   1.411 +        }
   1.412 +        print " $states{$state_dest_state[$state]},";
   1.413 +
   1.414 +        # The push-state field is optional.  If omitted, fill field with a zero, which flags
   1.415 +        #   the state machine that there is no push state.
   1.416 +        if ($state_push_state[$state] eq "") {
   1.417 +            print "0, ";
   1.418 +        } else {
   1.419 +            print " $states{$state_push_state[$state]},";
   1.420 +        }
   1.421 +        print " $state_flag[$state]} ";
   1.422 +
   1.423 +        # Put out a C++ comment showing the number (index) of this state row,
   1.424 +        #   and, if this is the first row of the table for this state, the state name.
   1.425 +        print "    //  $state ";
   1.426 +        if ($stateNames[$state] ne "") {
   1.427 +            print "     $stateNames[$state]";
   1.428 +        }
   1.429 +        print "\n";
   1.430 +    };
   1.431 +    print " };\n";
   1.432 +
   1.433 +
   1.434 +    #
   1.435 +    # Emit a mapping array from state numbers to state names.
   1.436 +    #   Rows that do not start a named state get a 0 entry, and a final 0 ends the array.
   1.437 +    #    This array is used for producing debugging output from the rule parser.
   1.438 +    #
   1.439 +    print "#ifdef RBBI_DEBUG\n";
   1.440 +    print "static const char * const RBBIRuleStateNames[] = {";
   1.441 +    for ($state=0; $state<$num_states; $state++) {
   1.442 +        if ($stateNames[$state] ne "") {
   1.443 +            print "     \"$stateNames[$state]\",\n";
   1.444 +        } else {
   1.445 +            print "    0,\n";
   1.446 +        }
   1.447 +    }
   1.448 +    print "    0};\n";
   1.449 +    print "#endif\n\n";
   1.450 +
   1.451 +    print "U_NAMESPACE_END\n";
   1.452 +    print "#endif\n";
   1.453 +}
   1.454 +
   1.455 +
   1.456 +

mercurial