#**************************************************************************
#   Copyright (C) 2002-2005 International Business Machines Corporation   *
#   and others. All rights reserved.                                      *
#**************************************************************************
#
#  rbbicst   Compile the RBBI rule parser state table data into initialized
#            C (or, with -j, Java) data.
#
#            Usage:
#                   cd icu/source/common
#                   perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
#                   perl rbbicst.pl -j < rbbirpt.txt > RBBIRuleParseTable.java
#
#            The output file, rbbirpt.h, is included by some of the .cpp rbbi
#            implementation files.  This perl script is NOT run as part
#            of a normal ICU build.  It is run by hand when needed, and the
#            rbbirpt.h generated file is put back into cvs.
#
#            The generated source is written to STDOUT.  All diagnostics are
#            written to STDERR so that they never end up inside the generated
#            file.
#
#            See rbbirpt.txt for a description of the input format for this
#            script.
#

use strict;
use warnings;

# -j selects Java output; the default is a C++ header.
my $javaOutput = 0;
if (@ARGV && $ARGV[0] eq "-j") {
    $javaOutput = 1;
    shift @ARGV;
}

my $num_states = 1;    # Always the state number for the line being compiled.
my $line_num   = 0;    # The line number in the input file.

my %states;                 # state name         -> state number
my @stateNames;             # state number       -> name (first row of a state only)
my @state_line_num;         # state number       -> input line, for error messages
my @state_literal_chars;    # state number       -> quoted literal character, if any
my @state_char_class;       # state number       -> character class name, if any
my @state_flag;             # state number       -> text of the "next char" boolean
my @state_dest_state;       # state number       -> name of the target state
my @state_push_state;       # state number       -> name of the push state, if any
my @state_func_name;        # state number       -> action name ("doNOP" if omitted)
my %charClasses;            # character class name -> assigned class number
my %actions;                # set of action names that appeared

# Spelling of the boolean constants in the selected output language.
my ($true_text, $false_text) = $javaOutput ? ("true", "false") : ("TRUE", "FALSE");

$states{"pop"} = 255;    # Add the "pop" to the list of defined state names.
                         # This prevents any state from being labelled with "pop",
                         # and resolves references to "pop" in the next state field.

#
#  Pass 1:  read and parse every line of the state table description.
#
LINE: while (my $line = <>) {
    chomp $line;
    $line_num++;
    my @fields = split ' ', $line;

    # Remove # comments, which are any fields beginning with a #, plus all
    # that follow on the line.
    for my $i (0 .. $#fields) {
        if ($fields[$i] =~ /^#/) {
            splice(@fields, $i);
            last;
        }
    }

    # Ignore blank lines, and those with no fields left after stripping comments.
    next LINE unless @fields;

    #
    #  State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name of a state.
    #    Put it in a hash, together with the current state number,
    #    so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        my $state_name = $fields[0];
        $state_name =~ s/://;    # strip off the colon from the state name.

        if ($states{$state_name}) {
            print STDERR " rbbicst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name}     = $num_states;
        $stateNames[$num_states] = $state_name;

        # If the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and
        # fall through.
        next LINE if @fields == 1;
        shift @fields;    # shift off label field in preparation
                          # for handling the rest of the line.
    }

    #
    #  State Transition line.
    #    syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #    where
    #       [something]   is an optional something
    #       character     is either a single quoted character, e.g. '['
    #                     or a name of a character class, e.g. white_space
    #

    # Remember the line number with each state so we can make better
    # error messages later.
    $state_line_num[$num_states] = $line_num;

    #
    #  First field, character class or literal character for this transition.
    #
    my $char_field = shift @fields;
    if ($char_field =~ /^'(.)'$/) {
        # We've got a quoted literal character.
        $state_literal_chars[$num_states] = $1;
    }
    else {
        # We've got the name of a character class.
        $state_char_class[$num_states] = $char_field;
        if ($char_field =~ /\W/) {
            print STDERR " rbbicsts: at line $line_num, bad character literal or character class name.\n";
            print STDERR " scanning $char_field\n";
            exit(-1);
        }
    }

    #
    #  Do the 'n' flag.
    #
    $state_flag[$num_states] = $false_text;
    if (@fields && $fields[0] eq "n") {
        $state_flag[$num_states] = $true_text;
        shift @fields;
    }

    #
    #  Do the destination state.
    #
    if (!@fields) {
        print STDERR " rbbicsts: at line $line_num, destination state missing.\n";
        exit(-1);
    }
    $state_dest_state[$num_states] = shift @fields;

    #
    #  Do the push state, if present.
    #
    if (@fields && $fields[0] =~ /^\^/) {
        my $push_state = shift @fields;
        $push_state =~ s/^\^//;
        if ($push_state eq "") {
            print STDERR " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        $state_push_state[$num_states] = $push_state;
    }

    #
    #  Lastly, do the optional action name.  Rows without one get "doNOP".
    #
    $state_func_name[$num_states] = @fields ? shift @fields : "doNOP";

    #
    #  There should be no fields left on the line at this point.
    #
    if (@fields) {
        print STDERR " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";
        print STDERR " scanning $fields[0]\n";
    }
    $num_states++;
}

#
#  We've read in the whole file, now go back and output the
#  source code for the state transition table.
#
#  We read all states first, before writing anything, so that the state numbers
#  for the destination states are all available to be written.
#

#
#  Make hashes for the names of the character classes and
#  for the names of the actions that appeared.
#
for my $state (1 .. $num_states - 1) {
    my $class_name = $state_char_class[$state];
    if (defined $class_name) {
        $charClasses{$class_name} = 1;
    }
    $actions{ $state_func_name[$state] } = 1;
}

#
#  Check that all of the destination states have been defined.
#
$states{"exit"} = 0;    # Predefined state name, terminates state machine.
my $errors = 0;
for my $state (1 .. $num_states - 1) {
    my $dest_state = $state_dest_state[$state];
    if (!$states{$dest_state} && $dest_state ne "exit") {
        print STDERR "Error at line $state_line_num[$state]: target state \"$dest_state\" is not defined.\n";
        $errors++;
    }
    my $push_state = $state_push_state[$state];
    if (defined $push_state && !$states{$push_state}) {
        print STDERR "Error at line $state_line_num[$state]: target state \"$push_state\" is not defined.\n";
        $errors++;
    }
}

die " rbbicst: $errors undefined state reference(s), no output generated.\n"
    if $errors > 0;

#
#  Assign numbers to each of the character classes used.
#    Sets are numbered from 128 - 250.
#    The values 0-127 in the state table are used for matching
#    individual ASCII characters (the only thing that can appear in the rules.)
#    The "set" names appearing in the table below (default, etc.) need special
#    handling because they do not correspond to a normal set of characters,
#    but trigger special handling by code in the state machine.
#
my %special_class_number = (
    default  => 255,
    escaped  => 254,
    escapedP => 253,
    eof      => 252,
);
my $next_class_number = 128;
for my $setName (sort keys %charClasses) {
    if (exists $special_class_number{$setName}) {
        $charClasses{$setName} = $special_class_number{$setName};
    }
    else {
        # Normal (single) character class.  Number them.
        $charClasses{$setName} = $next_class_number++;
    }
}

# Current year, used in the generated copyright notices.
my $year = (localtime)[5] + 1900;

if ($javaOutput) {
    #
    #  Java Output ...
    #
    print "/*\n",
          " *******************************************************************************\n",
          " * Copyright (C) 2003-$year,\n",
          " * International Business Machines Corporation and others. All Rights Reserved.\n",
          " *******************************************************************************\n",
          " */\n",
          " \n",
          "package com.ibm.icu.text;\n",
          " \n",
          "/**\n",
          " * Generated Java File. Do not edit by hand.\n",
          " * This file contains the state table for the ICU Rule Based Break Iterator\n",
          " * rule parser.\n",
          " * It is generated by the Perl script \"rbbicst.pl\" from\n",
          " * the rule parser state definitions file \"rbbirpt.txt\".\n",
          " * \@internal \n",
          " *\n",
          " */\n";

    print "class RBBIRuleParseTable\n";
    print "{\n";

    #
    #  Emit the constants for the actions to be performed.
    #
    my $action_number = 1;
    for my $act (sort keys %actions) {
        print " static final short $act = $action_number;\n";
        $action_number++;
    }
    print " \n";

    #
    #  Emit constants for char class names.
    #
    for my $setName (sort keys %charClasses) {
        print " static final short kRuleSet_$setName = $charClasses{$setName};\n";
    }
    print "\n\n";

    print " static class RBBIRuleTableElement { \n",
          " short fAction; \n",
          " short fCharClass; \n",
          " short fNextState; \n",
          " short fPushState; \n",
          " boolean fNextChar; \n",
          " String fStateName; \n",
          " RBBIRuleTableElement(short a, int cc, int ns, int ps, boolean nc, String sn) { \n",
          " fAction = a; \n",
          " fCharClass = (short)cc; \n",
          " fNextState = (short)ns; \n",
          " fPushState = (short)ps; \n",
          " fNextChar = nc; \n",
          " fStateName = sn; \n",
          " } \n",
          " }; \n",
          " \n";

    print " static RBBIRuleTableElement[] gRuleParseStateTable = { \n ";
    print " new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0 \n";    # output the unused state 0.
    for my $state (1 .. $num_states - 1) {
        print " , new RBBIRuleTableElement($state_func_name[$state],";
        if (defined $state_literal_chars[$state]) {
            print "'", $state_literal_chars[$state], "', ";
        }
        else {
            print " $charClasses{$state_char_class[$state]},";
        }
        print " $states{$state_dest_state[$state]},";

        # The push-state field is optional.  If omitted, fill field with a
        # zero, which flags the state machine that there is no push state.
        if (defined $state_push_state[$state]) {
            print " $states{$state_push_state[$state]},";
        }
        else {
            print "0, ";
        }
        print " $state_flag[$state], ";

        # If this is the first row of the table for this state, put out the
        # state name.
        my $state_name = $stateNames[$state];
        if (defined $state_name && $state_name ne "") {
            print " \"$state_name\") ";
        }
        else {
            print " null ) ";
        }

        # Put out a comment showing the number (index) of this state row.
        print " // $state ";
        print "\n";
    }
    print " };\n";

    print "}; \n";
}
else {
    #
    #  C++ Output ...
    #
    print "//---------------------------------------------------------------------------------\n",
          "//\n",
          "// Generated Header File. Do not edit by hand.\n",
          "// This file contains the state table for the ICU Rule Based Break Iterator\n",
          "// rule parser.\n",
          "// It is generated by the Perl script \"rbbicst.pl\" from\n",
          "// the rule parser state definitions file \"rbbirpt.txt\".\n",
          "//\n",
          "// Copyright (C) 2002-$year International Business Machines Corporation \n",
          "// and others. All rights reserved. \n",
          "//\n",
          "//---------------------------------------------------------------------------------\n",
          "#ifndef RBBIRPT_H\n",
          "#define RBBIRPT_H\n",
          "\n",
          "U_NAMESPACE_BEGIN\n";

    #
    #  Emit the constants for indices of Unicode Sets.
    #    Define one constant for each of the normal character classes
    #    encountered.  The specially handled classes (numbered 250 and up)
    #    get no constant here.
    #
    print "//\n";
    print "// Character classes for RBBI rule scanning.\n";
    print "//\n";
    for my $setName (sort keys %charClasses) {
        if ($charClasses{$setName} < 250) {
            # Normal character class.
            print " static const uint8_t kRuleSet_$setName = $charClasses{$setName};\n";
        }
    }
    print "\n\n";

    #
    #  Emit the enum for the actions to be performed.
    #
    print "enum RBBI_RuleParseAction {\n";
    for my $act (sort keys %actions) {
        print " $act,\n";
    }
    print " rbbiLastAction};\n\n";

    #
    #  Emit the struct definition for transition table elements.
    #
    print "//-------------------------------------------------------------------------------\n",
          "//\n",
          "// RBBIRuleTableEl represents the structure of a row in the transition table\n",
          "// for the rule parser state machine.\n",
          "//-------------------------------------------------------------------------------\n",
          "struct RBBIRuleTableEl {\n",
          " RBBI_RuleParseAction fAction;\n",
          " uint8_t fCharClass; // 0-127: an individual ASCII character\n",
          " // 128-255: character class index\n",
          " uint8_t fNextState; // 0-250: normal next-stat numbers\n",
          " // 255: pop next-state from stack.\n",
          " uint8_t fPushState;\n",
          " UBool fNextChar;\n",
          "};\n\n";

    #
    #  Emit the state transition table.
    #
    print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
    print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
    for my $state (1 .. $num_states - 1) {
        print " , {$state_func_name[$state],";
        if (defined $state_literal_chars[$state]) {
            my $c = $state_literal_chars[$state];

            # Use the numeric value, so EBCDIC machines are ok.  The character
            # is passed as an argument rather than interpolated into the
            # format, so that a literal '%' cannot corrupt the output.
            printf(" %d /* %s */,", ord($c), $c);
        }
        else {
            print " $charClasses{$state_char_class[$state]},";
        }
        print " $states{$state_dest_state[$state]},";

        # The push-state field is optional.  If omitted, fill field with a
        # zero, which flags the state machine that there is no push state.
        if (defined $state_push_state[$state]) {
            print " $states{$state_push_state[$state]},";
        }
        else {
            print "0, ";
        }
        print " $state_flag[$state]} ";

        # Put out a C++ comment showing the number (index) of this state row,
        # and, if this is the first row of the table for this state, the
        # state name.
        print " // $state ";
        my $state_name = $stateNames[$state];
        if (defined $state_name && $state_name ne "") {
            print " $state_name";
        }
        print "\n";
    }
    print " };\n";

    #
    #  Emit a mapping array from state numbers to state names.
    #
    #    This array is used for producing debugging output from the rule parser.
    #
    print "#ifdef RBBI_DEBUG\n";
    print "static const char * const RBBIRuleStateNames[] = {";
    for my $state (0 .. $num_states - 1) {
        my $state_name = $stateNames[$state];
        if (defined $state_name && $state_name ne "") {
            print " \"$state_name\",\n";
        }
        else {
            print " 0,\n";
        }
    }
    print " 0};\n";
    print "#endif\n\n";

    print "U_NAMESPACE_END\n";
    print "#endif\n";
}