#!/usr/bin/perl
#  ********************************************************************
#  * COPYRIGHT:
#  * Copyright (c) 2002-2007, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************
#
#  regexcst.pl
#       Compile the regular expression parser state table data into
#       initialized C data.
#
#       Usage:
#            cd icu/source/i18n
#            perl regexcst.pl < regexcst.txt > regexcst.h
#
#       The output file, regexcst.h, is included by some of the .cpp regex
#       implementation files.  This perl script is NOT run as part
#       of a normal ICU build.  It is run by hand when needed, and the
#       regexcst.h generated file is put back into cvs.
#
#       See regexcst.txt for a description of the input format for this script.
#
#       This script is derived from rbbicst.pl, which performs the same function
#       for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
#       merged?
#
#       Because STDOUT is the generated header, every diagnostic is written to
#       STDERR.  All validation happens before the first byte of output is
#       printed, so a failed run never leaves a half-written header behind.
#

use strict;
use warnings;

#
#  Print the given diagnostic text to STDERR and terminate with the same
#  exit status the script has always used for input errors.
#
sub fatal {
    my (@messages) = @_;
    print STDERR @messages;
    exit(-1);
}

my $num_states = 1;     # Always the state number for the line being compiled.
my $line_num   = 0;     # The line number in the input file.
my $errors     = 0;     # Number of errors found while validating the table.

my %states;             # state name -> state number
$states{"pop"} = 255;   # Add the "pop" to the list of defined state names.
                        # This prevents any state from being labelled with "pop",
                        #  and resolves references to "pop" in the next state field.

# Per-state (per table row) data, all indexed by state number.
my @stateNames;             # label attached to the row, if any
my @state_line_num;         # input line the row came from
my @state_literal_chars;    # literal character selector, if the row has one
my @state_char_class;       # character class selector, if the row has one
my @state_flag;             # "TRUE" / "FALSE": the 'n' (advance to next char) flag
my @state_dest_state;       # name of the destination state
my @state_push_state;       # name of the state to push, if any
my @state_func_name;        # name of the action function

LINE: while (my $line = <>) {
    chomp $line;
    $line_num++;
    my @fields = split ' ', $line;

    # Remove # comments, which are any fields beginning with a #, plus all
    #  that follow on the line.
    for my $i (0 .. $#fields) {
        if ($fields[$i] =~ /^#/) {
            splice(@fields, $i);
            last;
        }
    }

    # Ignore blank lines, and those with no fields left after stripping comments.
    next LINE unless @fields;

    #
    #  State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name of a state.
    #    Put in a hash, together with the current state number,
    #        so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        my $state_name = shift @fields;
        $state_name =~ s/:$//;      # strip off only the trailing label colon.

        if ($states{$state_name}) {
            # Reported but not fatal; the later definition wins.
            print STDERR "  regexcst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name}     = $num_states;
        $stateNames[$num_states] = $state_name;

        # If the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        next LINE unless @fields;
    }

    #
    #  State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g.  '['
    #                       or a name of a character class, e.g. white_space
    #

    $state_line_num[$num_states] = $line_num;   # remember line number with each state
                                                #  so we can make better error messages later.
    #
    #  First field, character class or literal character for this transition.
    #
    my $selector = shift @fields;
    if ($selector =~ /^'(.)'$/) {
        # We've got a quoted literal character.  Capture it rather than
        # stripping quotes, so that a quoted quote (''') survives.
        $state_literal_chars[$num_states] = $1;
    }
    else {
        # We've got the name of a character class.
        if ($selector =~ /\W/) {
            fatal("  regexcst: at line $line_num, bad character literal or character class name.\n",
                  "     scanning $selector\n");
        }
        $state_char_class[$num_states] = $selector;
    }

    #
    #  do the 'n' flag
    #
    $state_flag[$num_states] = "FALSE";
    if (@fields && $fields[0] eq "n") {
        $state_flag[$num_states] = "TRUE";
        shift @fields;
    }

    #
    #  do the destination state.
    #
    if (!@fields) {
        fatal("  regexcst: at line $line_num, destination state missing.\n");
    }
    $state_dest_state[$num_states] = shift @fields;

    #
    #  do the push state, if present.
    #
    if (@fields && $fields[0] =~ /^\^/) {
        my $push_state = shift @fields;
        $push_state =~ s/^\^//;
        if ($push_state eq "") {
            fatal("  regexcst: at line $line_num, expected state after ^ (no spaces).\n");
        }
        $state_push_state[$num_states] = $push_state;
    }

    #
    #  Lastly, do the optional action name.
    #
    if (@fields) {
        $state_func_name[$num_states] = shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #  Reported but not fatal.
    #
    if (@fields) {
        print STDERR "  regexcst: at line $line_num, unexpected extra stuff on input line.\n";
        print STDERR "     scanning $fields[0]\n";
    }
    $num_states++;
}

#
# We've read in the whole file, now go back and output the
#  C source code for the state transition table.
#
# We read all states first, before writing anything, so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
#      for the names of the actions that appeared.
#
my %charClasses;    # class name -> table index (filled in below)
my %actions;        # set of action names used by the table

# The dummy row 0 of the emitted table always refers to doNOP, so the
# enumerator has to exist even when every input row names an action.
$actions{"doNOP"} = 1;

for my $state (1 .. $num_states - 1) {
    my $class = $state_char_class[$state];
    if (defined $class && $class ne "") {
        $charClasses{$class} = 1;
    }
    if (!defined $state_func_name[$state] || $state_func_name[$state] eq "") {
        $state_func_name[$state] = "doNOP";
    }
    $actions{ $state_func_name[$state] } = 1;
}

#
# Check that all of the destination states have been defined
#
$states{"exit"} = 0;    # Predefined state name, terminates state machine.
for my $state (1 .. $num_states - 1) {
    my $dest = $state_dest_state[$state];
    if (!$states{$dest} && $dest ne "exit") {
        print STDERR "Error at line $state_line_num[$state]: target state \"$dest\" is not defined.\n";
        $errors++;
    }
    my $push = $state_push_state[$state];
    if (defined $push && !$states{$push}) {
        print STDERR "Error at line $state_line_num[$state]: target state \"$push\" is not defined.\n";
        $errors++;
    }
}

#
# State numbers are emitted into uint8_t fields and 255 is reserved for "pop",
# so the highest usable row index is 254.
#
if ($num_states > 255) {
    print STDERR "Error: too many states (", $num_states - 1,
                 "); state numbers must stay below 255, which is reserved for \"pop\".\n";
    $errors++;
}

#
# Assign the table index for each character class.
#   State Table values for Unicode char sets start at 128.
#   Sets "default", "quoted", etc. get special handling.
#   They have no corresponding UnicodeSet object in the state machine,
#   but are handled by special case code.  So we emit no reference
#   to a UnicodeSet object to them here.
#
# The names are processed in sorted order so that the numbering, and therefore
# the generated header, is identical from one run to the next.
#
my %special_class_index = (
    "default" => 255,
    "quoted"  => 254,
    "eof"     => 253,
);
my $next_class_index = 128;
my @rule_set_constants;     # the kRuleSet_xxx declarations, in output order
for my $setName (sort keys %charClasses) {
    if (exists $special_class_index{$setName}) {
        $charClasses{$setName} = $special_class_index{$setName};
    }
    else {
        # Normal character class.  Record the constant giving its index.
        push @rule_set_constants, " static const uint8_t kRuleSet_$setName = $next_class_index;\n";
        $charClasses{$setName} = $next_class_index;
        $next_class_index++;
    }
}
if ($next_class_index > 253) {
    print STDERR "Error: too many character classes; index ", $next_class_index - 1,
                 " collides with the reserved values 253-255.\n";
    $errors++;
}

die "regexcst: $errors error(s) found, no output generated.\n" if $errors > 0;

#
# Emit the file header.
#
# NOTE(review): the include guard RBBIRPT_H and the rbbiLastAction enumerator
# below look like leftovers from rbbicst.pl.  They are kept unchanged here so
# the text of the generated header does not change; confirm before renaming.
#
print "//---------------------------------------------------------------------------------\n";
print "//\n";
print "// Generated Header File. Do not edit by hand.\n";
print "// This file contains the state table for the ICU Regular Expression Pattern Parser\n";
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
print "// the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
print "// Copyright (C) 2002-2007 International Business Machines Corporation \n";
print "// and others. All rights reserved. \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";
print "#ifndef RBBIRPT_H\n";
print "#define RBBIRPT_H\n";
print "\n";
print "U_NAMESPACE_BEGIN\n";

#
# Emit the constants for indices of Unicode Sets
#   Define one constant for each of the character classes encountered.
#
print "//\n";
print "// Character classes for regex pattern scanning.\n";
print "//\n";
print @rule_set_constants;
print "\n\n";

#
# Emit the enum for the actions to be performed.
#
print "enum Regex_PatternParseAction {\n";
for my $act (sort keys %actions) {
    print " $act,\n";
}
print " rbbiLastAction};\n\n";

#
# Emit the struct definition for transition table elements.
#
print "//-------------------------------------------------------------------------------\n";
print "//\n";
print "// RegexTableEl represents the structure of a row in the transition table\n";
print "// for the pattern parser state machine.\n";
print "//-------------------------------------------------------------------------------\n";
print "struct RegexTableEl {\n";
print " Regex_PatternParseAction fAction;\n";
print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
print " // 128-255: character class index\n";
print " uint8_t fNextState; // 0-250: normal next-state numbers\n";
print " // 255: pop next-state from stack.\n";
print " uint8_t fPushState;\n";
print " UBool fNextChar;\n";
print "};\n\n";

#
# emit the state transition table
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
for my $state (1 .. $num_states - 1) {
    print " , {$state_func_name[$state],";

    my $literal = $state_literal_chars[$state];
    if (defined $literal) {
        # Use numeric value, so EBCDIC machines are ok.  The character is
        # passed as an argument, never interpolated into the format, so a
        # literal '%' cannot be mistaken for a conversion.
        printf(" %d /* %s */,", ord($literal), $literal);
    }
    else {
        print " $charClasses{$state_char_class[$state]},";
    }
    print " $states{$state_dest_state[$state]},";

    # The push-state field is optional.  If omitted, fill field with a zero, which flags
    #   the state machine that there is no push state.
    my $push = $state_push_state[$state];
    if (!defined $push) {
        print "0, ";
    }
    else {
        print " $states{$push},";
    }
    print " $state_flag[$state]} ";

    # Put out a C++ comment showing the number (index) of this state row,
    #   and, if this is the first row of the table for this state, the state name.
    print " // $state ";
    my $name = $stateNames[$state];
    if (defined $name && $name ne "") {
        print " $name";
    }
    print "\n";
}
print " };\n";


#
# emit a mapping array from state numbers to state names.
#
#    This array is used for producing debugging output from the pattern parser.
#
print "static const char * const RegexStateNames[] = {";
for my $state (0 .. $num_states - 1) {
    my $name = $stateNames[$state];
    if (defined $name && $name ne "") {
        print " \"$name\",\n";
    }
    else {
        print " 0,\n";
    }
}
print " 0};\n\n";

print "U_NAMESPACE_END\n";
print "#endif\n";