intl/icu/source/i18n/regexcst.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/regexcst.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,328 @@
     1.4 +#!/usr/bin/perl
     1.5 +#  ********************************************************************
     1.6 +#  * COPYRIGHT:
     1.7 +#  * Copyright (c) 2002-2007, International Business Machines Corporation and
     1.8 +#  * others. All Rights Reserved.
     1.9 +#  ********************************************************************
    1.10 +#
    1.11 +#  regexcst.pl
    1.12 +#            Compile the regular expression parser state table data into initialized C data.
    1.13 +#            Usage:
    1.14 +#                   cd icu/source/i18n
    1.15 +#                   perl regexcst.pl < regexcst.txt > regexcst.h
    1.16 +#
    1.17 +#             The output file, regexcst.h, is included by some of the .cpp regex
    1.18 +#             implementation files.   This perl script is NOT run as part
    1.19 +#             of a normal ICU build.  It is run by hand when needed, and the
    1.20 +#             regexcst.h generated file is put back into cvs.
    1.21 +#
    1.22 +#             See regexcst.txt for a description of the input format for this script.
    1.23 +#
    1.24 +#             This script is derived from rbbicst.pl, which performs the same function
    1.25 +#             for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
    1.26 +#             merged?
    1.27 +#
    1.28 +
    1.29 +
    1.30 +$num_states = 1;         # Number of the state-table row being compiled; rows are numbered from 1.
    1.31 +$line_num  = 0;          # The line number in the input file (incremented as each line is read).
    1.32 +
    1.33 +$states{"pop"} = 255;    # Pre-register "pop" in the hash of defined state names.
    1.34 +                         # A state labelled "pop" is then reported as a duplicate definition,
    1.35 +                         #  and references to "pop" in the next state field resolve to 255.
    1.36 +# Main parse loop: each non-blank, non-comment input line becomes one state-table row.
    1.37 +line_loop: while (<>) {
    1.38 +    chomp();
    1.39 +    $line = $_;
    1.40 +    @fields = split();
    1.41 +    $line_num++;
    1.42 +
    1.43 +    # Remove # comments, which are any fields beginning with a #, plus all
    1.44 +    #  that follow on the line.
    1.45 +    for ($i=0; $i<@fields; $i++) {
    1.46 +        if ($fields[$i] =~ /^#/) {
    1.47 +            @fields = @fields[0 .. $i-1];
    1.48 +            last;
    1.49 +        }
    1.50 +    }
    1.51 +    # ignore blank lines, and those with no fields left after stripping comments..
    1.52 +    if (@fields == 0) {
    1.53 +        next;
    1.54 +    }
    1.55 +
    1.56 +    #
    1.57 +    # State Label:  handling.
    1.58 +    #    Does the first token end with a ":"?  If so, it's the name  of a state.
    1.59 +    #    Put in a hash, together with the current state number,
    1.60 +    #        so that we can later look up the number from the name.
    1.61 +    #
    1.62 +    if ($fields[0] =~ /.*:$/) {
    1.63 +        $state_name = $fields[0];
    1.64 +        $state_name =~ s/://;        # strip off the colon from the state name.
    1.65 +
    1.66 +        if ($states{$state_name} != 0) {
    1.67 +            print "  rbbicst: at line $line_num duplicate definition of state $state_name\n";
    1.68 +        }
    1.69 +        $states{$state_name} = $num_states;
    1.70 +        $stateNames[$num_states] = $state_name;
    1.71 +
    1.72 +        # if the label was the only thing on this line, go on to the next line,
    1.73 +        # otherwise assume that a state definition is on the same line and fall through.
    1.74 +        if (@fields == 1) {
    1.75 +            next line_loop;
    1.76 +        }
    1.77 +        shift @fields;                       # shift off label field in preparation
    1.78 +                                             #  for handling the rest of the line.
    1.79 +    }
    1.80 +
    1.81 +    #
    1.82 +    # State Transition line.
    1.83 +    #   syntax is this,
    1.84 +    #       character   [n]  target-state  [^push-state]  [function-name]
    1.85 +    #   where
    1.86 +    #      [something]   is an optional something
    1.87 +    #      character     is either a single quoted character e.g. '['
    1.88 +    #                       or a name of a character class, e.g. white_space
    1.89 +    #
    1.90 +
    1.91 +    $state_line_num[$num_states] = $line_num;   # remember line number with each state
    1.92 +                                                #  so we can make better error messages later.
    1.93 +    #
    1.94 +    # First field, character class or literal character for this transition.
    1.95 +    #
    1.96 +    if ($fields[0] =~ /^'.'$/) {
    1.97 +        # We've got a quoted literal character.
    1.98 +        $state_literal_chars[$num_states] = $fields[0];
    1.99 +        $state_literal_chars[$num_states] =~ s/'//g;
   1.100 +    } else {
   1.101 +        # We've got the name of a character class.
   1.102 +        $state_char_class[$num_states] = $fields[0];
   1.103 +        if ($fields[0] =~ /[\W]/) {
   1.104 +            print "  rbbicsts:  at line $line_num, bad character literal or character class name.\n";
   1.105 +            print "     scanning $fields[0]\n";
   1.106 +            exit(-1);
   1.107 +        }
   1.108 +    }
   1.109 +    shift @fields;
   1.110 +
   1.111 +    #
   1.112 +    # do the 'n' flag; it is emitted later as the fNextChar member of the table row.
   1.113 +    #
   1.114 +    $state_flag[$num_states] = "FALSE";
   1.115 +    if ($fields[0] eq "n") {
   1.116 +        $state_flag[$num_states] = "TRUE";
   1.117 +        shift @fields;
   1.118 +    }
   1.119 +
   1.120 +    #
   1.121 +    # do the destination state.
   1.122 +    #
   1.123 +    $state_dest_state[$num_states] = $fields[0];
   1.124 +    if ($fields[0] eq "") {
   1.125 +        print "  rbbicsts:  at line $line_num, destination state missing.\n";
   1.126 +        exit(-1);
   1.127 +    }
   1.128 +    shift @fields;
   1.129 +
   1.130 +    #
   1.131 +    # do the push state, if present.  It is written as ^name, with no space after the ^.
   1.132 +    #
   1.133 +    if ($fields[0] =~ /^\^/) {
   1.134 +        $fields[0] =~ s/^\^//;
   1.135 +        $state_push_state[$num_states] = $fields[0];
   1.136 +        if ($fields[0] eq "" ) {
   1.137 +            print "  rbbicsts:  at line $line_num, expected state after ^ (no spaces).\n";
   1.138 +            exit(-1);
   1.139 +        }
   1.140 +        shift @fields;
   1.141 +    }
   1.142 +
   1.143 +    #
   1.144 +    # Lastly, do the optional action name.
   1.145 +    #
   1.146 +    if ($fields[0] ne "") {
   1.147 +        $state_func_name[$num_states] = $fields[0];
   1.148 +        shift @fields;
   1.149 +    }
   1.150 +
   1.151 +    #
   1.152 +    #  There should be no fields left on the line at this point.
   1.153 +    #
   1.154 +    if (@fields > 0) {
   1.155 +        print "  rbbicsts:  at line $line_num, unexpected extra stuff on input line.\n";
   1.156 +        print "     scanning $fields[0]\n";
   1.157 +    }
   1.158 +    $num_states++;
   1.159 +}
   1.160 +
   1.161 +#
   1.162 +# We've read in the whole file, now go back and output the
   1.163 +#   C source code for the state transition table.
   1.164 +#
   1.165 +# We read all states first, before writing anything,  so that the state numbers
   1.166 +# for the destination states are all available to be written.
   1.167 +#
   1.168 +
   1.169 +#
   1.170 +# Make hashes for the names of the character classes and
   1.171 +#      for the names of the actions that appeared.
   1.172 +#      Transitions that named no action are given the default action "doNOP".
   1.173 +for ($state=1; $state < $num_states; $state++) {
   1.174 +    if ($state_char_class[$state] ne "") {
   1.175 +        if ($charClasses{$state_char_class[$state]} == 0) {
   1.176 +            $charClasses{$state_char_class[$state]} = 1;
   1.177 +        }
   1.178 +    }
   1.179 +    if ($state_func_name[$state] eq "") {
   1.180 +        $state_func_name[$state] = "doNOP";
   1.181 +    }
   1.182 +    if ($actions{$state_func_name[$state]} == 0) {
   1.183 +        $actions{$state_func_name[$state]} = 1;
   1.184 +    }
   1.185 +}
   1.186 +
   1.187 +#
   1.188 +# Check that every destination state and every push state named in a
   1.189 +#   transition has been defined.  Each undefined name is reported and counted,
   1.190 +#   and the script dies after the scan if any were found.
   1.191 +$states{"exit"} = 0;              # Predefined state name, terminates state machine.
   1.192 +for ($state=1; $state<$num_states; $state++) {
   1.193 +   if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {
   1.194 +       print "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
   1.195 +       $errors++;
   1.196 +   }
   1.197 +   if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {
   1.198 +       print "Error at line $state_line_num[$state]: target state \"$state_push_state[$state]\" is not defined.\n";
   1.199 +       $errors++;
   1.200 +   }
   1.201 +}
   1.202 +
   1.203 +die if ($errors>0);
   1.204 +# Emit the banner comment, include guard and namespace opening of the generated header.
   1.205 +print "//---------------------------------------------------------------------------------\n";
   1.206 +print "//\n";
   1.207 +print "// Generated Header File.  Do not edit by hand.\n";
   1.208 +print "//    This file contains the state table for the ICU Regular Expression Pattern Parser\n";
   1.209 +print "//    It is generated by the Perl script \"regexcst.pl\" from\n";
   1.210 +print "//    the rule parser state definitions file \"regexcst.txt\".\n";
   1.211 +print "//\n";
   1.212 +print "//   Copyright (C) 2002-2007 International Business Machines Corporation \n";
   1.213 +print "//   and others. All rights reserved.  \n";
   1.214 +print "//\n";
   1.215 +print "//---------------------------------------------------------------------------------\n";
   1.216 +print "#ifndef REGEXCST_H\n";     # guard named after the generated file, regexcst.h
   1.217 +print "#define REGEXCST_H\n";
   1.218 +print "\n";
   1.219 +print "U_NAMESPACE_BEGIN\n";
   1.220 +
   1.221 +#
   1.222 +# Emit the constants for indices of Unicode Sets
   1.223 +#   Define one constant for each of the character classes encountered.
   1.224 +#   At the same time, store the index corresponding to the set name back into hash.
   1.225 +#   Set names are visited in sorted order so that regenerated output is reproducible.
   1.226 +print "//\n";
   1.227 +print "// Character classes for regex pattern scanning.\n";
   1.228 +print "//\n";
   1.229 +$i = 128;                   # State Table values for Unicode char sets range from 128-250.
   1.230 +                            # Sets "default", "quoted", etc. get special handling.
   1.231 +                            #  They have no corresponding UnicodeSet object in the state machine,
   1.232 +                            #    but are handled by special case code.  So we emit no reference
   1.233 +                            #    to a UnicodeSet object to them here.
   1.234 +foreach $setName (sort keys %charClasses) {
   1.235 +    if ($setName eq "default") {
   1.236 +        $charClasses{$setName} = 255;}
   1.237 +    elsif ($setName eq "quoted") {
   1.238 +        $charClasses{$setName} = 254;}
   1.239 +    elsif ($setName eq "eof") {
   1.240 +        $charClasses{$setName} = 253;}
   1.241 +    else {
   1.242 +        # Normal character class.  Emit a constant holding its table value and record that value.
   1.243 +        print "    static const uint8_t kRuleSet_$setName = $i;\n";
   1.244 +        $charClasses{$setName} = $i;
   1.245 +        $i++;
   1.246 +    }
   1.247 +}
   1.248 +print "\n\n";
   1.249 +
   1.250 +#
   1.251 +# Emit the enum for the actions to be performed.
   1.252 +#   Action names are listed in sorted order so that regenerated output is reproducible.
   1.253 +print "enum Regex_PatternParseAction {\n";
   1.254 +foreach $act (sort keys %actions) {
   1.255 +    print "    $act,\n";
   1.256 +}
   1.257 +print "    rbbiLastAction};\n\n";
   1.258 +
   1.259 +#
   1.260 +# Emit the struct definition for transition table elements.
   1.261 +#
   1.262 +print "//-------------------------------------------------------------------------------\n";
   1.263 +print "//\n";
   1.264 +print "//  RegexTableEl       represents the structure of a row in the transition table\n";
   1.265 +print "//                     for the pattern parser state machine.\n";
   1.266 +print "//-------------------------------------------------------------------------------\n";
   1.267 +print "struct RegexTableEl {\n";
   1.268 +print "    Regex_PatternParseAction      fAction;\n";
   1.269 +print "    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character\n";
   1.270 +print "                                                    // 128-255:  character class index\n";
   1.271 +print "    uint8_t                       fNextState;       // 0-250:    normal next-state numbers\n";
   1.272 +print "                                                    // 255:      pop next-state from stack.\n";
   1.273 +print "    uint8_t                       fPushState;\n";
   1.274 +print "    UBool                         fNextChar;\n";
   1.275 +print "};\n\n";
   1.276 +
   1.277 +#
   1.278 +# emit the state transition table, one initializer row per compiled transition.
   1.279 +#
   1.280 +print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
   1.281 +print "    {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
   1.282 +for ($state=1; $state < $num_states; $state++) {
   1.283 +    print "    , {$state_func_name[$state],";
   1.284 +    if ($state_literal_chars[$state] ne "") {
   1.285 +        $c = $state_literal_chars[$state];
   1.286 +        printf(" %d /* %s */,", ord($c), $c);   #  use numeric value, so EBCDIC machines are ok.
   1.287 +    }else {
   1.288 +        print " $charClasses{$state_char_class[$state]},";
   1.289 +    }
   1.290 +    print " $states{$state_dest_state[$state]},";
   1.291 +
   1.292 +    # The push-state field is optional.  If omitted, fill field with a zero, which flags
   1.293 +    #   the state machine that there is no push state.
   1.294 +    if ($state_push_state[$state] eq "") {
   1.295 +        print "0, ";
   1.296 +    } else {
   1.297 +        print " $states{$state_push_state[$state]},";
   1.298 +    }
   1.299 +    print " $state_flag[$state]} ";
   1.300 +
   1.301 +    # Put out a C++ comment showing the number (index) of this state row,
   1.302 +    #   and, if this is the first row of the table for this state, the state name.
   1.303 +    print "    //  $state ";
   1.304 +    if ($stateNames[$state] ne "") {
   1.305 +        print "     $stateNames[$state]";
   1.306 +    }
   1.307 +    print "\n";
   1.308 +};
   1.309 +print " };\n";
   1.310 +
   1.311 +
   1.312 +#
   1.313 +# emit a mapping array from state numbers to state names.
   1.314 +#    Entry N is the label attached to table row N, or 0 when row N carries no label;
   1.315 +#    This array is used for producing debugging output from the pattern parser.
   1.316 +#    A final 0 entry terminates the array.
   1.317 +print "static const char * const RegexStateNames[] = {";
   1.318 +for ($state=0; $state<$num_states; $state++) {
   1.319 +    if ($stateNames[$state] ne "") {
   1.320 +        print "     \"$stateNames[$state]\",\n";
   1.321 +    } else {
   1.322 +        print "    0,\n";
   1.323 +    }
   1.324 +}
   1.325 +print "    0};\n\n";
   1.326 +
   1.327 +print "U_NAMESPACE_END\n";
   1.328 +print "#endif\n";
   1.329 +
   1.330 +
   1.331 +

mercurial