intl/icu/source/i18n/regexcst.pl

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rwxr-xr-x

Correct previous dual key logic pending first delivery installment.

michael@0 1 #!/usr/bin/perl
michael@0 2 # ********************************************************************
michael@0 3 # * COPYRIGHT:
michael@0 4 # * Copyright (c) 2002-2007, International Business Machines Corporation and
michael@0 5 # * others. All Rights Reserved.
michael@0 6 # ********************************************************************
michael@0 7 #
michael@0 8 # regexcst.pl
michael@0 9 # Compile the regular expression parser state table data into initialized C data.
michael@0 10 # Usage:
michael@0 11 # cd icu/source/i18n
michael@0 12 # perl regexcst.pl < regexcst.txt > regexcst.h
michael@0 13 #
michael@0 14 # The output file, regexcst.h, is included by some of the .cpp regex
michael@0 15 # implementation files. This perl script is NOT run as part
michael@0 16 # of a normal ICU build. It is run by hand when needed, and the
michael@0 17 # regexcst.h generated file is put back into cvs.
michael@0 18 #
michael@0 19 # See regexcst.txt for a description of the input format for this script.
michael@0 20 #
michael@0 21 # This script is derived from rbbicst.pl, which performs the same function
michael@0 22 # for the Rule Based Break Iterator Rule Parser. Perhaps they could be
michael@0 23 # merged?
michael@0 24 #
michael@0 25 # Pass 1: read the state table description line by line and record each
michael@0 26 # transition row in the parallel $state_* arrays, indexed by row number.
michael@0 27 $num_states = 1; # Always the state number for the line being compiled.
michael@0 28 $line_num = 0; # The line number in the input file.
michael@0 29
michael@0 30 $states{"pop"} = 255; # Add the "pop" to the list of defined state names.
michael@0 31 # This makes a label named "pop" get reported as a duplicate definition,
michael@0 32 # and resolves references to "pop" in the next state field.
michael@0 33
michael@0 34 line_loop: while (<>) {
michael@0 35 chomp();
michael@0 36 $line = $_;
michael@0 37 @fields = split();
michael@0 38 $line_num++;
michael@0 39
michael@0 40 # Remove # comments, which are any fields beginning with a #, plus all
michael@0 41 # that follow on the line.
michael@0 42 for ($i=0; $i<@fields; $i++) {
michael@0 43 if ($fields[$i] =~ /^#/) {
michael@0 44 @fields = @fields[0 .. $i-1];
michael@0 45 last;
michael@0 46 }
michael@0 47 }
michael@0 48 # ignore blank lines, and those with no fields left after stripping comments.
michael@0 49 if (@fields == 0) {
michael@0 50 next;
michael@0 51 }
michael@0 52
michael@0 53 #
michael@0 54 # State Label: handling.
michael@0 55 # Does the first token end with a ":"? If so, it's the name of a state.
michael@0 56 # Put in a hash, together with the current state number,
michael@0 57 # so that we can later look up the number from the name.
michael@0 58 #
michael@0 59 if ($fields[0] =~ /.*:$/) {
michael@0 60 $state_name = $fields[0];
michael@0 61 $state_name =~ s/://; # strip off the colon from the state name.
michael@0 62
michael@0 63 if ($states{$state_name} != 0) { # nonzero: name already has a state number (or is "pop")
michael@0 64 print " rbbicst: at line $line_num duplicate definition of state $state_name\n";
michael@0 65 }
michael@0 66 $states{$state_name} = $num_states;
michael@0 67 $stateNames[$num_states] = $state_name;
michael@0 68
michael@0 69 # if the label was the only thing on this line, go on to the next line,
michael@0 70 # otherwise assume that a state definition is on the same line and fall through.
michael@0 71 if (@fields == 1) {
michael@0 72 next line_loop;
michael@0 73 }
michael@0 74 shift @fields; # shift off label field in preparation
michael@0 75 # for handling the rest of the line.
michael@0 76 }
michael@0 77
michael@0 78 #
michael@0 79 # State Transition line.
michael@0 80 # syntax is this,
michael@0 81 # character [n] target-state [^push-state] [function-name]
michael@0 82 # where
michael@0 83 # [something] is an optional something
michael@0 84 # character is either a single quoted character e.g. '['
michael@0 85 # or a name of a character class, e.g. white_space
michael@0 86 #
michael@0 87
michael@0 88 $state_line_num[$num_states] = $line_num; # remember line number with each state
michael@0 89 # so we can make better error messages later.
michael@0 90 #
michael@0 91 # First field, character class or literal character for this transition.
michael@0 92 #
michael@0 93 if ($fields[0] =~ /^'.'$/) {
michael@0 94 # We've got a quoted literal character.
michael@0 95 $state_literal_chars[$num_states] = $fields[0];
michael@0 96 $state_literal_chars[$num_states] =~ s/'//g;
michael@0 97 } else {
michael@0 98 # We've got the name of a character class.
michael@0 99 $state_char_class[$num_states] = $fields[0];
michael@0 100 if ($fields[0] =~ /[\W]/) {
michael@0 101 print " rbbicsts: at line $line_num, bad character literal or character class name.\n";
michael@0 102 print " scanning $fields[0]\n";
michael@0 103 exit(-1);
michael@0 104 }
michael@0 105 }
michael@0 106 shift @fields;
michael@0 107
michael@0 108 #
michael@0 109 # do the 'n' flag
michael@0 110 #
michael@0 111 $state_flag[$num_states] = "FALSE";
michael@0 112 if ($fields[0] eq "n") {
michael@0 113 $state_flag[$num_states] = "TRUE";
michael@0 114 shift @fields;
michael@0 115 }
michael@0 116
michael@0 117 #
michael@0 118 # do the destination state.
michael@0 119 #
michael@0 120 $state_dest_state[$num_states] = $fields[0];
michael@0 121 if ($fields[0] eq "") {
michael@0 122 print " rbbicsts: at line $line_num, destination state missing.\n";
michael@0 123 exit(-1);
michael@0 124 }
michael@0 125 shift @fields;
michael@0 126
michael@0 127 #
michael@0 128 # do the push state, if present.
michael@0 129 #
michael@0 130 if ($fields[0] =~ /^\^/) {
michael@0 131 $fields[0] =~ s/^\^//;
michael@0 132 $state_push_state[$num_states] = $fields[0];
michael@0 133 if ($fields[0] eq "" ) {
michael@0 134 print " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";
michael@0 135 exit(-1);
michael@0 136 }
michael@0 137 shift @fields;
michael@0 138 }
michael@0 139
michael@0 140 #
michael@0 141 # Lastly, do the optional action name.
michael@0 142 #
michael@0 143 if ($fields[0] ne "") {
michael@0 144 $state_func_name[$num_states] = $fields[0];
michael@0 145 shift @fields;
michael@0 146 }
michael@0 147
michael@0 148 #
michael@0 149 # There should be no fields left on the line at this point.
michael@0 150 # (Extra fields are only reported; the row is still accepted.)
michael@0 151 if (@fields > 0) {
michael@0 152 print " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";
michael@0 153 print " scanning $fields[0]\n";
michael@0 154 }
michael@0 155 $num_states++;
michael@0 156 }
michael@0 157
michael@0 158 #
michael@0 159 # We've read in the whole file, now go back and output the
michael@0 160 # C source code for the state transition table.
michael@0 161 #
michael@0 162 # We read all states first, before writing anything, so that the state numbers
michael@0 163 # for the destination states are all available to be written.
michael@0 164 #
michael@0 165
michael@0 166 #
michael@0 167 # Make hashes for the names of the character classes and
michael@0 168 # for the names of the actions that appeared.
michael@0 169 # Rows that named no action are given the action "doNOP".
michael@0 170 for ($state=1; $state < $num_states; $state++) {
michael@0 171 if ($state_char_class[$state] ne "") {
michael@0 172 if ($charClasses{$state_char_class[$state]} == 0) {
michael@0 173 $charClasses{$state_char_class[$state]} = 1;
michael@0 174 }
michael@0 175 }
michael@0 176 if ($state_func_name[$state] eq "") {
michael@0 177 $state_func_name[$state] = "doNOP";
michael@0 178 }
michael@0 179 if ($actions{$state_func_name[$state]} == 0) { # test the same key that is stored below
michael@0 180 $actions{$state_func_name[$state]} = 1;
michael@0 181 }
michael@0 182 }
michael@0 183
michael@0 184 #
michael@0 185 # Check that every destination state and every push state named in a
michael@0 186 # transition row has been defined as a state label.
michael@0 187 # NOTE(review): these messages go to STDOUT, which the documented usage redirects into regexcst.h.
michael@0 188 $states{"exit"} = 0; # Predefined state name, terminates state machine.
michael@0 189 for ($state=1; $state<$num_states; $state++) {
michael@0 190 if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {
michael@0 191 print "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
michael@0 192 $errors++;
michael@0 193 }
michael@0 194 if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {
michael@0 195 print "Error at line $state_line_num[$state]: target state \"$state_push_state[$state]\" is not defined.\n";
michael@0 196 $errors++;
michael@0 197 }
michael@0 198 }
michael@0 199 # Stop before any C code is emitted if an undefined state was referenced.
michael@0 200 die if ($errors>0);
michael@0 201 # Emit the banner comment, the include guard and the namespace opener of the generated header.
michael@0 202 print "//---------------------------------------------------------------------------------\n";
michael@0 203 print "//\n";
michael@0 204 print "// Generated Header File. Do not edit by hand.\n";
michael@0 205 print "// This file contains the state table for the ICU Regular Expression Pattern Parser\n";
michael@0 206 print "// It is generated by the Perl script \"regexcst.pl\" from\n";
michael@0 207 print "// the rule parser state definitions file \"regexcst.txt\".\n";
michael@0 208 print "//\n";
michael@0 209 print "// Copyright (C) 2002-2007 International Business Machines Corporation \n";
michael@0 210 print "// and others. All rights reserved. \n";
michael@0 211 print "//\n";
michael@0 212 print "//---------------------------------------------------------------------------------\n";
michael@0 213 print "#ifndef RBBIRPT_H\n"; # NOTE(review): guard name presumably inherited from rbbicst.pl - confirm it cannot clash
michael@0 214 print "#define RBBIRPT_H\n";
michael@0 215 print "\n";
michael@0 216 print "U_NAMESPACE_BEGIN\n";
michael@0 217
michael@0 218 #
michael@0 219 # Emit the constants for indices of Unicode Sets
michael@0 220 # Define one constant for each of the character classes encountered.
michael@0 221 # At the same time, store the index corresponding to the set name back into hash.
michael@0 222 #
michael@0 223 print "//\n";
michael@0 224 print "// Character classes for regex pattern scanning.\n";
michael@0 225 print "//\n";
michael@0 226 $i = 128; # State Table values for Unicode char sets range from 128-250.
michael@0 227 # Sets "default", "quoted", etc. get special handling.
michael@0 228 # They have no corresponding UnicodeSet object in the state machine,
michael@0 229 # but are handled by special case code. So we emit no reference
michael@0 230 # to a UnicodeSet object to them here.
michael@0 231 foreach $setName (sort keys %charClasses) { # sorted so the numbering is the same on every run
michael@0 232 if ($setName eq "default") {
michael@0 233 $charClasses{$setName} = 255;}
michael@0 234 elsif ($setName eq "quoted") {
michael@0 235 $charClasses{$setName} = 254;}
michael@0 236 elsif ($setName eq "eof") {
michael@0 237 $charClasses{$setName} = 253;}
michael@0 238 else {
michael@0 239 # Normal character class. Emit a kRuleSet_<name> constant and remember its index in the hash.
michael@0 240 print " static const uint8_t kRuleSet_$setName = $i;\n";
michael@0 241 $charClasses{$setName} = $i;
michael@0 242 $i++;
michael@0 243 }
michael@0 244 }
michael@0 245 print "\n\n";
michael@0 246
michael@0 247 #
michael@0 248 # Emit the enum for the actions to be performed, one enumerator per action name seen.
michael@0 249 #
michael@0 250 print "enum Regex_PatternParseAction {\n";
michael@0 251 foreach $act (sort keys %actions) { # sorted so the generated enum is the same on every run
michael@0 252 print " $act,\n";
michael@0 253 }
michael@0 254 print " rbbiLastAction};\n\n";
michael@0 255
michael@0 256 #
michael@0 257 # Emit the struct definition for transition table elements.
michael@0 258 # The field order here is the order in which the table rows are written out below.
michael@0 259 print "//-------------------------------------------------------------------------------\n";
michael@0 260 print "//\n";
michael@0 261 print "// RegexTableEl represents the structure of a row in the transition table\n";
michael@0 262 print "// for the pattern parser state machine.\n";
michael@0 263 print "//-------------------------------------------------------------------------------\n";
michael@0 264 print "struct RegexTableEl {\n";
michael@0 265 print " Regex_PatternParseAction fAction;\n";
michael@0 266 print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
michael@0 267 print " // 128-255: character class index\n";
michael@0 268 print " uint8_t fNextState; // 0-250: normal next-state numbers\n";
michael@0 269 print " // 255: pop next-state from stack.\n";
michael@0 270 print " uint8_t fPushState;\n";
michael@0 271 print " UBool fNextChar;\n";
michael@0 272 print "};\n\n";
michael@0 273
michael@0 274 #
michael@0 275 # emit the state transition table, one initializer row per transition line read from the input.
michael@0 276 #
michael@0 277 print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
michael@0 278 print " {doNOP, 0, 0, 0, TRUE}\n"; # State 0 is a dummy. Real states start with index = 1.
michael@0 279 for ($state=1; $state < $num_states; $state++) {
michael@0 280 print " , {$state_func_name[$state],";
michael@0 281 if ($state_literal_chars[$state] ne "") {
michael@0 282 $c = $state_literal_chars[$state]; # passed as a printf argument so a literal '%' is not read as a format directive
michael@0 283 printf(" %d /* %s */,", ord($c), $c); # use numeric value, so EBCDIC machines are ok.
michael@0 284 }else {
michael@0 285 print " $charClasses{$state_char_class[$state]},";
michael@0 286 }
michael@0 287 print " $states{$state_dest_state[$state]},";
michael@0 288
michael@0 289 # The push-state field is optional. If omitted, fill field with a zero, which flags
michael@0 290 # the state machine that there is no push state.
michael@0 291 if ($state_push_state[$state] eq "") {
michael@0 292 print "0, ";
michael@0 293 } else {
michael@0 294 print " $states{$state_push_state[$state]},";
michael@0 295 }
michael@0 296 print " $state_flag[$state]} ";
michael@0 297
michael@0 298 # Put out a C++ comment showing the number (index) of this state row,
michael@0 299 # and, if this is the first row of the table for this state, the state name.
michael@0 300 print " // $state ";
michael@0 301 if ($stateNames[$state] ne "") {
michael@0 302 print " $stateNames[$state]";
michael@0 303 }
michael@0 304 print "\n";
michael@0 305 };
michael@0 306 print " };\n";
michael@0 307
michael@0 308
michael@0 309 #
michael@0 310 # emit a mapping array from state numbers to state names.
michael@0 311 # Rows that did not carry a state label (including the dummy row 0) get a 0 entry.
michael@0 312 # This array is used for producing debugging output from the pattern parser.
michael@0 313 #
michael@0 314 print "static const char * const RegexStateNames[] = {";
michael@0 315 for ($state=0; $state<$num_states; $state++) {
michael@0 316 if ($stateNames[$state] ne "") {
michael@0 317 print " \"$stateNames[$state]\",\n";
michael@0 318 } else {
michael@0 319 print " 0,\n";
michael@0 320 }
michael@0 321 }
michael@0 322 print " 0};\n\n"; # one extra 0 entry follows the last row
michael@0 323 # Close the namespace and the include guard opened at the top of the generated header.
michael@0 324 print "U_NAMESPACE_END\n";
michael@0 325 print "#endif\n";
michael@0 326
michael@0 327
michael@0 328

mercurial