#!/usr/bin/perl
#  ********************************************************************
#  * COPYRIGHT:
#  * Copyright (c) 2002-2007, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************
#
#  regexcst.pl
#            Compile the regular expression parser state table data into
#            initialized C data.
#            Usage:
#                   cd icu/source/i18n
#                   perl regexcst.pl < regexcst.txt > regexcst.h
#
#            The output file, regexcst.h, is included by some of the .cpp regex
#            implementation files.  This perl script is NOT run as part
#            of a normal ICU build.  It is run by hand when needed, and the
#            regexcst.h generated file is put back into cvs.
#
#            See regexcst.txt for a description of the input format for this
#            script.
#
#            This script is derived from rbbicst.pl, which performs the same
#            function for the Rule Based Break Iterator Rule Parser.  Perhaps
#            they could be merged?
#
#  Diagnostics are written to STDERR, because STDOUT is the generated header.
#

use strict;
use warnings;

my $num_states = 1;    # Always the state number for the line being compiled.
my $line_num   = 0;    # The line number in the input file.
my $errors     = 0;    # Count of undefined-state errors found after parsing.

# Map from state name to state number.
# "pop" is pre-defined; this prevents any state from being labelled "pop",
# and resolves references to "pop" in the next-state field.
my %states = ("pop" => 255);

# Per-row data, all indexed by state (row) number.
my @state_names;            # label attached to the row, if any
my @state_line_num;         # input line each row came from, for error messages
my @state_literal_chars;    # literal character matched by the row, if any
my @state_char_class;       # character class name matched by the row, if any
my @state_flag;             # "TRUE"/"FALSE": the 'n' flag of the row
my @state_dest_state;       # name of the next state
my @state_push_state;       # name of the state to push, if any
my @state_func_name;        # name of the action to perform, if any

my %char_classes;           # character class name -> table index
my %actions;                # set of action names that appeared

LINE:
while (my $line = <>) {
    chomp $line;
    $line_num++;
    my @fields = split ' ', $line;

    # Remove # comments, which are any fields beginning with a #, plus all
    # that follow on the line.
    for my $i (0 .. $#fields) {
        if ($fields[$i] =~ /^#/) {
            splice(@fields, $i);
            last;
        }
    }

    # Ignore blank lines, and those with no fields left after stripping comments.
    next LINE if !@fields;

    #
    # State Label handling.
    #   Does the first token end with a ":"?  If so, it's the name of a state.
    #   Put it in a hash, together with the current state number,
    #   so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        my $state_name = $fields[0];
        $state_name =~ s/://;    # strip off the colon from the state name.

        # A duplicate label is reported but not fatal; the later definition wins.
        if ($states{$state_name}) {
            warn "regexcst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name}      = $num_states;
        $state_names[$num_states] = $state_name;

        # If the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line.
        next LINE if @fields == 1;
        shift @fields;
    }

    #
    # State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g. '['
    #                    or a name of a character class, e.g. white_space
    #

    # Remember the line number with each state so we can make better error
    # messages later.
    $state_line_num[$num_states] = $line_num;

    #
    # First field: character class or literal character for this transition.
    #
    my $char_field = shift @fields;
    if ($char_field =~ /^'.'$/) {
        # We've got a quoted literal character.
        (my $literal = $char_field) =~ s/'//g;
        $state_literal_chars[$num_states] = $literal;
    }
    else {
        # We've got the name of a character class.
        if ($char_field =~ /[\W]/) {
            warn "regexcst: at line $line_num, bad character literal or character class name.\n";
            warn "     scanning $char_field\n";
            exit(-1);
        }
        $state_char_class[$num_states] = $char_field;
    }

    #
    # Do the 'n' flag.
    #
    $state_flag[$num_states] = "FALSE";
    if (@fields && $fields[0] eq "n") {
        $state_flag[$num_states] = "TRUE";
        shift @fields;
    }

    #
    # Do the destination state.
    #
    my $dest_state = @fields ? shift @fields : "";
    if ($dest_state eq "") {
        warn "regexcst: at line $line_num, destination state missing.\n";
        exit(-1);
    }
    $state_dest_state[$num_states] = $dest_state;

    #
    # Do the push state, if present.
    #
    if (@fields && $fields[0] =~ /^\^/) {
        my $push_state = shift @fields;
        $push_state =~ s/^\^//;
        if ($push_state eq "") {
            warn "regexcst: at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        $state_push_state[$num_states] = $push_state;
    }

    #
    # Lastly, do the optional action name.
    #
    if (@fields && $fields[0] ne "") {
        $state_func_name[$num_states] = shift @fields;
    }

    #
    # There should be no fields left on the line at this point.
    # This is reported but is not fatal.
    #
    if (@fields > 0) {
        warn "regexcst: at line $line_num, unexpected extra stuff on input line.\n";
        warn "     scanning $fields[0]\n";
    }
    $num_states++;
}

#
# We've read in the whole file, now go back and output the
# C source code for the state transition table.
#
# We read all states first, before writing anything, so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
# for the names of the actions that appeared.
# Rows without an explicit action get the default action "doNOP".
#
for my $state (1 .. $num_states - 1) {
    my $class_name = $state_char_class[$state];
    if (defined $class_name && $class_name ne "") {
        $char_classes{$class_name} = 1;
    }
    if (!defined $state_func_name[$state] || $state_func_name[$state] eq "") {
        $state_func_name[$state] = "doNOP";
    }
    $actions{ $state_func_name[$state] } = 1;
}

#
# Check that all of the destination states have been defined.
#
$states{"exit"} = 0;    # Predefined state name, terminates state machine.
for my $state (1 .. $num_states - 1) {
    my $dest_state = $state_dest_state[$state];
    if (!$states{$dest_state} && $dest_state ne "exit") {
        warn "Error at line $state_line_num[$state]: target state \"$dest_state\" is not defined.\n";
        $errors++;
    }

    # Note: "exit" (state number 0) is not accepted as a push state, since a
    # push-state value of zero in the table means "no push state".
    my $push_state = $state_push_state[$state];
    if (defined $push_state && $push_state ne "" && !$states{$push_state}) {
        warn "Error at line $state_line_num[$state]: target state \"$push_state\" is not defined.\n";
        $errors++;
    }
}

die "regexcst: $errors undefined state reference(s); no output generated.\n"
    if $errors > 0;

# NOTE(review): the include guard name RBBIRPT_H appears to be inherited from
# rbbicst.pl.  It is left unchanged so the generated header stays the same.
print "//---------------------------------------------------------------------------------\n";
print "//\n";
print "// Generated Header File. Do not edit by hand.\n";
print "// This file contains the state table for the ICU Regular Expression Pattern Parser\n";
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
print "// the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
print "// Copyright (C) 2002-2007 International Business Machines Corporation \n";
print "// and others. All rights reserved. \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";
print "#ifndef RBBIRPT_H\n";
print "#define RBBIRPT_H\n";
print "\n";
print "U_NAMESPACE_BEGIN\n";

#
# Emit the constants for indices of Unicode Sets.
#   Define one constant for each of the character classes encountered.
#   At the same time, store the index corresponding to the set name back into
#   the hash.
#
print "//\n";
print "// Character classes for regex pattern scanning.\n";
print "//\n";

# State Table values for Unicode char sets range from 128-250.
# Sets "default", "quoted", etc. get special handling.
# They have no corresponding UnicodeSet object in the state machine,
# but are handled by special case code.  So we emit no reference
# to a UnicodeSet object to them here.
my %special_class_index = (
    "default" => 255,
    "quoted"  => 254,
    "eof"     => 253,
);
my $next_set_index = 128;

# Sorted so that the generated header is the same from run to run.
foreach my $set_name (sort keys %char_classes) {
    if (exists $special_class_index{$set_name}) {
        $char_classes{$set_name} = $special_class_index{$set_name};
    }
    else {
        # Normal character class: emit a named constant for its table index.
        print " static const uint8_t kRuleSet_$set_name = $next_set_index;\n";
        $char_classes{$set_name} = $next_set_index;
        $next_set_index++;
    }
}
print "\n\n";

#
# Emit the enum for the actions to be performed.
# Sorted so that the generated header is the same from run to run.
#
print "enum Regex_PatternParseAction {\n";
foreach my $act (sort keys %actions) {
    print " $act,\n";
}
print " rbbiLastAction};\n\n";

#
# Emit the struct definition for transition table elements.
#
print "//-------------------------------------------------------------------------------\n";
print "//\n";
print "// RegexTableEl represents the structure of a row in the transition table\n";
print "// for the pattern parser state machine.\n";
print "//-------------------------------------------------------------------------------\n";
print "struct RegexTableEl {\n";
print " Regex_PatternParseAction fAction;\n";
print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
print " // 128-255: character class index\n";
print " uint8_t fNextState; // 0-250: normal next-state numbers\n";
print " // 255: pop next-state from stack.\n";
print " uint8_t fPushState;\n";
print " UBool fNextChar;\n";
print "};\n\n";

#
# Emit the state transition table.
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
for my $state (1 .. $num_states - 1) {
    print " , {" . $state_func_name[$state] . ",";

    my $literal = $state_literal_chars[$state];
    if (defined $literal && $literal ne "") {
        # Use the numeric value, so EBCDIC machines are ok.  The character is
        # passed as a printf argument so that a literal '%' cannot be taken
        # for a format directive.
        printf(" %d /* %s */,", ord($literal), $literal);
    }
    else {
        print " " . $char_classes{ $state_char_class[$state] } . ",";
    }
    print " " . $states{ $state_dest_state[$state] } . ",";

    # The push-state field is optional.  If omitted, fill the field with a
    # zero, which flags the state machine that there is no push state.
    my $push_state = $state_push_state[$state];
    if (!defined $push_state || $push_state eq "") {
        print "0, ";
    }
    else {
        print " " . $states{$push_state} . ",";
    }
    print " " . $state_flag[$state] . "} ";

    # Put out a C++ comment showing the number (index) of this state row,
    # and, if this is the first row of the table for this state, the state name.
    print " // $state ";
    if (defined $state_names[$state] && $state_names[$state] ne "") {
        print " $state_names[$state]";
    }
    print "\n";
}
print " };\n";

#
# Emit a mapping array from state numbers to state names.
#
# This array is used for producing debugging output from the pattern parser.
#
print "static const char * const RegexStateNames[] = {";
for my $state (0 .. $num_states - 1) {
    if (defined $state_names[$state] && $state_names[$state] ne "") {
        print " \"$state_names[$state]\",\n";
    }
    else {
        print " 0,\n";
    }
}
print " 0};\n\n";

print "U_NAMESPACE_END\n";
print "#endif\n";