#**************************************************************************
#   Copyright (C) 2002-2005 International Business Machines Corporation   *
#   and others. All rights reserved.                                      *
#**************************************************************************
#
#  rbbicst   Compile the RBBI rule parser state table data into initialized C data.
#            Usage:
#                   cd icu/source/common
#                   perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
#                   perl rbbicst.pl -j < rbbirpt.txt > RBBIRuleParseTable.java
#
#             The output file, rbbirpt.h, is included by some of the .cpp rbbi
#             implementation files.   This perl script is NOT run as part
#             of a normal ICU build.  It is run by hand when needed, and the
#             rbbirpt.h generated file is put back into cvs.
#
#             See rbbirpt.txt for a description of the input format for this script.
#
#  Implementation note: the script keeps its state in package globals
#  ($num_states, %states, @state_* ...) that are filled in by the parse loop
#  below and read by the sections that follow it.
#

# Optional "-j" switch selects Java output instead of the C++ header.
if (@ARGV && $ARGV[0] eq "-j") {
    $javaOutput = 1;
    shift @ARGV;
}


$num_states = 1;         # Always the state number for the line being compiled.
$line_num  = 0;          # The line number in the input file.

$states{"pop"} = 255;    # Add the "pop" to the list of defined state names.
                         # This prevents any state from being labelled with "pop",
                         #  and resolves references to "pop" in the next state field.

#
# Read the state table description, one transition per input line.
#
# All diagnostics are written to STDERR: the documented usage redirects STDOUT
# into the generated source file, so a message printed there would be hidden
# from the person running the script and would corrupt the generated file.
#
line_loop: while (<>) {
    chomp();
    $line = $_;
    @fields = split();
    $line_num++;

    # Remove # comments, which are any fields beginning with a #, plus all
    #  that follow on the line.
    for ($i=0; $i<@fields; $i++) {
        if ($fields[$i] =~ /^#/) {
            @fields = @fields[0 .. $i-1];
            last;
        }
    }
    # ignore blank lines, and those with no fields left after stripping comments..
    if (@fields == 0) {
        next;
    }

    #
    # State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name  of a state.
    #    Put in a hash, together with the current state number,
    #        so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        $state_name = $fields[0];
        $state_name =~ s/:$//;        # strip off the trailing colon from the state name.

        # State numbers start at 1 and "pop" is pre-registered as 255, so any
        # non-zero entry means the name is already taken.
        if ($states{$state_name} != 0) {
            print STDERR "  rbbicst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name} = $num_states;
        $stateNames[$num_states] = $state_name;

        # if the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        if (@fields == 1) {
            next line_loop;
        }
        shift @fields;                       # shift off label field in preparation
                                             #  for handling the rest of the line.
    }

    #
    # State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g. '['
    #                       or a name of a character class, e.g. white_space
    #

    $state_line_num[$num_states] = $line_num;   # remember line number with each state
                                                #  so we can make better error messages later.
    #
    # First field, character class or literal character for this transition.
    #
    if ($fields[0] =~ /^'(.)'$/) {
        # We've got a quoted literal character.  Keep exactly the character
        # between the quotes, so that a quoted ' is not reduced to nothing.
        $state_literal_chars[$num_states] = $1;
    } else {
        # We've got the name of a character class.
        $state_char_class[$num_states] = $fields[0];
        if ($fields[0] =~ /\W/) {
            print STDERR "  rbbicsts:  at line $line_num, bad character literal or character class name.\n";
            print STDERR "     scanning $fields[0]\n";
            exit(-1);
        }
    }
    shift @fields;

    #
    # do the 'n' flag
    #
    $state_flag[$num_states] = $javaOutput? "false" : "FALSE";
    if ($fields[0] eq "n") {
        $state_flag[$num_states] = $javaOutput? "true": "TRUE";
        shift @fields;
    }

    #
    # do the destination state.
    #
    $state_dest_state[$num_states] = $fields[0];
    if ($fields[0] eq "") {
        print STDERR "  rbbicsts:  at line $line_num, destination state missing.\n";
        exit(-1);
    }
    shift @fields;

    #
    # do the push state, if present.
    #
    if ($fields[0] =~ /^\^/) {
        $fields[0] =~ s/^\^//;
        $state_push_state[$num_states] = $fields[0];
        if ($fields[0] eq "" ) {
            print STDERR "  rbbicsts:  at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        shift @fields;
    }

    #
    # Lastly, do the optional action name.
    #
    if ($fields[0] ne "") {
        $state_func_name[$num_states] = $fields[0];
        shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #  (Reported, but not treated as fatal.)
    #
    if (@fields > 0) {
        print STDERR "  rbbicsts:  at line $line_num, unexpected extra stuff on input line.\n";
        print STDERR "     scanning $fields[0]\n";
    }
    $num_states++;
}

#
# We've read in the whole file, now go back and output the
#   C source code for the state transition table.
#
# We read all states first, before writing anything,  so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
#      for the names of the actions that appeared.
#
# Transitions with no action name are given the action "doNOP".
#
for ($state=1; $state < $num_states; $state++) {
    if ($state_char_class[$state] ne "") {
        if ($charClasses{$state_char_class[$state]} == 0) {
            $charClasses{$state_char_class[$state]} = 1;
        }
    }
    if ($state_func_name[$state] eq "") {
        $state_func_name[$state] = "doNOP";
    }
    # Record the action for this transition.  The guard tests the same name
    # that is being recorded.
    if ($actions{$state_func_name[$state]} == 0) {
        $actions{$state_func_name[$state]} = 1;
    }
}

#
# Check that all of the destination states have been defined
#
# Messages go to STDERR because STDOUT is normally redirected into the
# generated source file.
#
$states{"exit"} = 0;              # Predefined state name, terminates state machine.
for ($state=1; $state<$num_states; $state++) {
    # A zero entry means "not defined", except for the predefined "exit" state.
    if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {
        print STDERR "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
        $errors++;
    }
    if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {
        print STDERR "Error at line $state_line_num[$state]: target state \"$state_push_state[$state]\" is not defined.\n";
        $errors++;
    }
}

die "rbbicst: $errors undefined state reference(s), aborting.\n" if ($errors>0);

#
# Assign numbers to each of the character classes used.
#   Sets are numbered from 128 - 250
#   The values 0-127 in the state table are used for matching
#     individual ASCII characters (the only thing that can appear in the rules.)
#   The "set" names appearing in the code below (default, etc.)  need special
#     handling because they do not correspond to a normal set of characters,
#     but trigger special handling by code in the state machine.
#
# Number the character classes.  The four special names get fixed values
# (252-255); every other class is numbered sequentially from 128 in sorted
# name order.
#
$i = 128;
foreach $setName (sort keys %charClasses) {
    if ($setName eq "default") {
        $charClasses{$setName} = 255;}
    elsif ($setName eq "escaped") {
        $charClasses{$setName} = 254;}
    elsif ($setName eq "escapedP") {
        $charClasses{$setName} = 253;}
    elsif ($setName eq "eof") {
        $charClasses{$setName} = 252;}
    else {
        # Normal (single) character class.  Number them.
        # Refuse to hand out a number that collides with the reserved
        # values above; the table would silently match the wrong class.
        if ($i >= 252) {
            print STDERR "  rbbicst: too many character classes, \"$setName\" would collide with a reserved class number.\n";
            exit(-1);
        }
        $charClasses{$setName} = $i;
        $i++;
    }
}


# Current year, used in the generated copyright notice.
# Element 5 of localtime's list is the year minus 1900.
my $year = (localtime)[5] + 1900;

if ($javaOutput) {
    #
    # Java Output ...
    #
    print "/*\n";
    print " *******************************************************************************\n";
    print " * Copyright (C) 2003-$year,\n";
    print " * International Business Machines Corporation and others. All Rights Reserved.\n";
    print " *******************************************************************************\n";
    print " */\n";
    print " \n";
    print "package com.ibm.icu.text;\n";
    print " \n";
    print "/**\n";
    print " * Generated Java File. Do not edit by hand.\n";
    print " * This file contains the state table for the ICU Rule Based Break Iterator\n";
    print " * rule parser.\n";
    print " * It is generated by the Perl script \"rbbicst.pl\" from\n";
    print " * the rule parser state definitions file \"rbbirpt.txt\".\n";
    print " * \@internal \n";
    print " *\n";
    print " */\n";

    print "class RBBIRuleParseTable\n";
    print "{\n";

    #
    # Emit the constants for the actions to be performed.
    #
    $n = 1;
    foreach $act (sort keys %actions) {
        print " static final short $act = $n;\n";
        $n++;
    }
    print " \n";

    #
    # Emit constants for char class names
    #
    foreach $setName (sort keys %charClasses) {
        print " static final short kRuleSet_$setName = $charClasses{$setName};\n";
    }
    print "\n\n";


    print " static class RBBIRuleTableElement { \n";
    print " short fAction; \n";
    print " short fCharClass; \n";
    print " short fNextState; \n";
    print " short fPushState; \n";
    print " boolean fNextChar; \n";
    print " String fStateName; \n";
    print " RBBIRuleTableElement(short a, int cc, int ns, int ps, boolean nc, String sn) { \n";
    print " fAction = a; \n";
    print " fCharClass = (short)cc; \n";
    print " fNextState = (short)ns; \n";
    print " fPushState = (short)ps; \n";
    print " fNextChar = nc; \n";
    print " fStateName = sn; \n";
    print " } \n";
    print " }; \n";
    print " \n";


    print " static RBBIRuleTableElement[] gRuleParseStateTable = { \n ";
    print " new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0 \n";  #output the unused state 0.
    for ($state=1; $state < $num_states; $state++) {
        print " , new RBBIRuleTableElement($state_func_name[$state],";
        if ($state_literal_chars[$state] ne "") {
            $c = $state_literal_chars[$state];
            # A backslash or single quote must be escaped to form a valid
            # Java character literal.
            $c =~ s/([\\'])/\\$1/g;
            print("'$c', ");
        }else {
            print " $charClasses{$state_char_class[$state]},";
        }
        print " $states{$state_dest_state[$state]},";

        # The push-state field is optional.  If omitted, fill field with a zero, which flags
        #   the state machine that there is no push state.
        if ($state_push_state[$state] eq "") {
            print "0, ";
        } else {
            print " $states{$state_push_state[$state]},";
        }
        print " $state_flag[$state], ";

        # if this is the first row of the table for this state, put out the state name.
        if ($stateNames[$state] ne "") {
            print " \"$stateNames[$state]\") ";
        } else {
            print " null ) ";
        }

        # Put out a comment showing the number (index) of this state row,
        print " // $state ";
        print "\n";
    }
    print " };\n";

    print "}; \n";

}
else
{
    #
    # C++ Output ...
    #


    print "//---------------------------------------------------------------------------------\n";
    print "//\n";
    print "// Generated Header File. Do not edit by hand.\n";
    print "// This file contains the state table for the ICU Rule Based Break Iterator\n";
    print "// rule parser.\n";
    print "// It is generated by the Perl script \"rbbicst.pl\" from\n";
    print "// the rule parser state definitions file \"rbbirpt.txt\".\n";
    print "//\n";
    print "// Copyright (C) 2002-$year International Business Machines Corporation \n";
    print "// and others. All rights reserved. \n";
    print "//\n";
    print "//---------------------------------------------------------------------------------\n";
    print "#ifndef RBBIRPT_H\n";
    print "#define RBBIRPT_H\n";
    print "\n";
    print "U_NAMESPACE_BEGIN\n";

    #
    # Emit the constants for indices of Unicode Sets
    #   Define one constant for each of the character classes encountered.
    #   Only classes numbered below 250 get a constant here.
    #
    print "//\n";
    print "// Character classes for RBBI rule scanning.\n";
    print "//\n";
    foreach $setName (sort keys %charClasses) {
        if ($charClasses{$setName} < 250) {
            # Normal character class.
            print " static const uint8_t kRuleSet_$setName = $charClasses{$setName};\n";
        }
    }
    print "\n\n";

    #
    # Emit the enum for the actions to be performed.
    #
    print "enum RBBI_RuleParseAction {\n";
    foreach $act (sort keys %actions) {
        print " $act,\n";
    }
    print " rbbiLastAction};\n\n";

    #
    # Emit the struct definition for transition table elements.
    #
    print "//-------------------------------------------------------------------------------\n";
    print "//\n";
    print "// RBBIRuleTableEl represents the structure of a row in the transition table\n";
    print "// for the rule parser state machine.\n";
    print "//-------------------------------------------------------------------------------\n";
    print "struct RBBIRuleTableEl {\n";
    print " RBBI_RuleParseAction fAction;\n";
    print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
    print " // 128-255: character class index\n";
    print " uint8_t fNextState; // 0-250: normal next-stat numbers\n";
    print " // 255: pop next-state from stack.\n";
    print " uint8_t fPushState;\n";
    print " UBool fNextChar;\n";
    print "};\n\n";

    #
    # emit the state transition table
    #
    print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
    print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
    for ($state=1; $state < $num_states; $state++) {
        print " , {$state_func_name[$state],";
        if ($state_literal_chars[$state] ne "") {
            $c = $state_literal_chars[$state];
            printf(" %d /* $c */,", ord($c));   #  use numeric value, so EBCDIC machines are ok.
        }else {
            print " $charClasses{$state_char_class[$state]},";
        }
        print " $states{$state_dest_state[$state]},";

        # The push-state field is optional.  If omitted, fill field with a zero, which flags
        #   the state machine that there is no push state.
        if ($state_push_state[$state] eq "") {
            print "0, ";
        } else {
            print " $states{$state_push_state[$state]},";
        }
        print " $state_flag[$state]} ";

        # Put out a C++ comment showing the number (index) of this state row,
        #   and, if this is the first row of the table for this state, the state name.
        print " // $state ";
        if ($stateNames[$state] ne "") {
            print " $stateNames[$state]";
        }
        print "\n";
    }
    print " };\n";


    #
    # emit a mapping array from state numbers to state names.
    #
    #    This array is used for producing debugging output from the rule parser.
    #
    print "#ifdef RBBI_DEBUG\n";
    print "static const char * const RBBIRuleStateNames[] = {";
    for ($state=0; $state<$num_states; $state++) {
        if ($stateNames[$state] ne "") {
            print " \"$stateNames[$state]\",\n";
        } else {
            print " 0,\n";
        }
    }
    print " 0};\n";
    print "#endif\n\n";

    print "U_NAMESPACE_END\n";
    print "#endif\n";
}