#!/usr/bin/perl
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# gentransliterate.pl
#
# Reads UnicodeData-Latest.txt from the current directory and writes
# ../tables/transliterate.properties, a list of "entity.<decimal>=<ascii>"
# lines giving an ASCII approximation for non-ASCII characters.
#
# The data file is read twice: the first pass records the decomposition
# mapping of every character above 256 so that the second pass can expand
# decompositions recursively.
#
# The script only runs main() when executed directly, so the helper
# subroutines can be loaded (require) and exercised without touching files.

use strict;
use warnings;

my $header = <<END_OF_HEADER;
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

#
# THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl
# PLEASE DO NOT MODIFY THIS FILE BY HAND
#
entity.list.name=transliterate
entity.169=(c)
#
#
# Here are the windows-1252 characters from the range 0x80 - 0x9F
#
END_OF_HEADER

my $handcoded = <<END_OF_HANDCODED;
# EURO SIGN
entity.8364=EUR
# SINGLE LOW-9 QUOTATION MARK
entity.8218=,
# LATIN SMALL LETTER F WITH HOOK
entity.402=f
# DOUBLE LOW-9 QUOTATION MARK
entity.8222="
# DAGGER
entity.8224=+
# DOUBLE DAGGER
entity.8225=++
# MODIFIER LETTER CIRCUMFLEX ACCENT
entity.710=^
# PER MILLE SIGN
entity.8240=0/00
# SINGLE LEFT-POINTING ANGLE QUOTATION MARK
entity.8249=<
# LATIN CAPITAL LIGATURE OE
entity.338=OE
# LEFT SINGLE QUOTATION MARK
entity.8216='
# RIGHT SINGLE QUOTATION MARK
entity.8217='
# LEFT DOUBLE QUOTATION MARK
entity.8220="
# RIGHT DOUBLE QUOTATION MARK
entity.8221="
# BULLET
entity.8226=.
# EN DASH
entity.8211=--
# EM DASH
entity.8212=---
# SMALL TILDE
entity.732=~
# SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
entity.8250=>
# LATIN SMALL LIGATURE OE
entity.339=oe
# U+2000 EN QUAD
entity.8192=\\u0020
# U+2001 EM QUAD
entity.8193=\\u0020
# U+2010 HYPHEN
entity.8208=-
# U+2011 NON-BREAKING HYPHEN
entity.8209=-
# U+2012 FIGURE DASH
entity.8210=-
# U+2015 HORIZONTAL BAR
entity.8213=--
# U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma)
entity.8203=
# U+2061, ApplyFunction, character showing function application in presentation tagging
entity.8289=
# U+2062, InvisibleTimes, marks multiplication when it is understood without a mark
entity.8290=
# U+2146, DifferentialD, d for use in differentials, e.g., within integrals
entity.8518=d
# U+2212, MINUS SIGN, official Unicode minus sign
entity.8722=-
# Hebrew punctuation
# U+05BE HEBREW PUNCTUATION MAQAF
entity.1470=-
# U+05C0 HEBREW PUNCTUATION PASEQ
entity.1472=|
# U+05C3 HEBREW PUNCTUATION SOF PASUQ
entity.1475=:
# U+05F3 HEBREW PUNCTUATION GERESH
entity.1523='
# U+05F4 HEBREW PUNCTUATION GERSHAYIM
entity.1524="
##
## End of hand coded section
## Below are generated from the unicode character database
##
END_OF_HANDCODED

# Code point (hex string, as written in the data file) => raw decomposition
# field. Filled by load_decomposition_table(), read by rdecompose().
my %table;

# Handle of the generated properties file; set by main(), used by output().
my $out_fh;

# ASCII stand-in appended for each diacritic word found in a character name.
my %MARK_FOR = (
    DOT        => ".",
    DIAERESIS  => "\"",
    BREVE      => "(",
    ACUTE      => "\'",
    GRAVE      => "`",
    TILDE      => "~",
    CARON      => "(",
    HOOK       => "?",
    CEDILLA    => ",",
    MACRON     => "-",
    CIRCUMFLEX => "^",
    RING       => "*",
    OGONEK     => ";",
    LINE       => "_",
    COMMA      => ",",
    STROKE     => "/",
    HORN       => "+",
);

# Ordered (pattern, replacement) pairs applied by foldcombining(). The order
# is the order of the original substitutions and is applied sequentially.
# Macron/overline fold to 002D and diaeresis to 0022 because only ASCII can
# be emitted (the exact equivalents would be 00AF and 00A8).
my @COMBINING_FOLDS = (
    [ '00A8', '0022' ],         # diaeresis
    [ '00AF', '002D' ],         # macron
    [ '00B4', '0027' ],         # acute accent
    [ '00B7', '002E' ],         # middle dot
    [ '0300', '0060' ],         # combining grave
    [ '0301', '0027' ],         # combining acute
    [ '0302', '005E' ],         # combining circumflex
    [ '0303', '007E' ],         # combining tilde
    [ '0304', '002D' ],         # combining macron
    [ '0305', '002D' ],         # combining overline
    [ '0307', '002E' ],         # combining dot above
    [ '0308', '0022' ],         # combining diaeresis
    [ '030B', '0022' ],         # combining double acute
    [ '030D', '0027' ],         # combining vertical line above
    [ '030E', '0022' ],         # combining double vertical line above
    [ '030F', '0060 0060' ],    # combining double grave
);

# Ordered rules for tagged ("<tag> ...") decompositions. The first tag found
# in the decomposition wins. A rule without a handler, or a handler that
# returns undef, means "emit nothing for this character".
my @FORMAT_RULES = (
    [ 'wide',     sub { decompose("<wide>", $_[0]) } ],
    [ 'narrow',   undef ],
    [ 'circle',   sub { decompose("<circle>", "(" . $_[0] . ")") } ],
    [ 'fraction', sub { decompose("<fraction>", $_[0]) } ],
    [ 'small',    sub { decompose("<small>", $_[0]) } ],
    [ 'vertical', undef ],
    [ 'super',    sub { "^(" . decompose("<super>", $_[0]) . ")" } ],
    [ 'sub',      sub { "v(" . decompose("<sub>", $_[0]) . ")" } ],
    [ 'font',     sub { decompose("<font>", $_[0]) } ],
    [ 'square',   undef ],
    [ 'compat',   sub { decompose("<compat>", $_[0]) } ],
    [ 'isolated', undef ],
    [ 'medial',   undef ],
    [ 'final',    undef ],
    [ 'initial',  undef ],
    # Only a no-break form of SPACE is transliterated (to an escaped space).
    [ 'noBreak',  sub { $_[0] eq "<noBreak> 0020" ? "\\u0020" : undef } ],
);

# FromLatinComment($cmt)
# Builds an ASCII approximation from a Latin character name: an apostrophe
# for "PRECEDED BY APOSTROPHE", the base letter(s) named after
# "CAPITAL LETTER"/"SMALL LETTER", then one mark per diacritic word in the
# order the words appear. Unknown words are ignored.
sub FromLatinComment
{
    my ($cmt) = @_;
    my $char = "";
    if ($cmt =~ /PRECEDED BY APOSTROPHE/) {
        $char = "\'";
    }
    if ($cmt =~ /CAPITAL LETTER ([A-Z]*)/) {
        $char .= $1;
    }
    if ($cmt =~ /SMALL LETTER ([A-Z]*)/) {
        $char .= lc($1);
    }
    # Iterate over every word; a plain foreach does not stop early on an
    # empty word the way "while ($item = shift ...)" would.
    foreach my $word (split(/ /, $cmt)) {
        $char .= $MARK_FOR{$word} if exists $MARK_FOR{$word};
    }
    return $char;
}

# warning($text) - prints a diagnostic line on standard output.
sub warning
{
    my ($warning) = @_;
    print "WARNING: $warning \n";
}

# doutput($u, $cmt, $udec, $str)
# Debugging variant of output(): prints the entity line on standard output
# without any filtering.
sub doutput
{
    my ($u, $cmt, $udec, $str) = @_;
    # don't print out comments - for debugging purposes only
    # print "# U+$u $cmt\n";
    print "entity.$udec=$str\n";
}

# output($u, $cmt, $udec, $str)
# Writes "entity.$udec=$str" to the properties file when $str is pure ASCII.
# When $str still contains a non-ASCII \uXXXX escape, Latin letters (other
# than LONG S forms) fall back to the name-derived approximation; anything
# else is dropped.
sub output
{
    my ($u, $cmt, $udec, $str) = @_;
    if (decomposeIntoNonASCII($str)) {
        if (($cmt =~ /LATIN/) && ($cmt =~ /LETTER/) && ($cmt !~ /LONG/)) {
            # FromLatinComment() never produces a \u escape, so this
            # recursion ends after one level.
            output($u, $cmt, $udec, FromLatinComment($cmt));
        }
        return;
    }
    # don't print out comments - for debugging purposes only
    # print {$out_fh} "# U+$u $cmt\n";
    print {$out_fh} "entity.$udec=$str\n";
}

# decomposeIntoNonASCII($str)
# True when $str contains a \uXXXX escape whose value is 0x0080 or above.
sub decomposeIntoNonASCII
{
    my ($dec) = @_;
    return $dec =~ /\\u([1-9A-F][0-9A-F][0-9A-F]|[0-9A-F][1-9A-F][0-9A-F]|00[8-9A-F])[0-9A-F]/;
}

# foldcombining($dec)
# Replaces the code points of common spacing/combining diacritics in a
# decomposition string with the code points of their ASCII look-alikes.
# Breve, hook, ring and caron are deliberately left untouched.
sub foldcombining
{
    my ($dec) = @_;
    foreach my $fold (@COMBINING_FOLDS) {
        my ($from, $to) = @{$fold};
        $dec =~ s/$from/$to/g;
    }
    return $dec;
}

# rdecompose($code)
# Recursively expands one code point (hex string) through the decomposition
# table. Returns the argument unchanged when it has no table entry;
# otherwise returns the tag-stripped, diacritic-folded mapping with each of
# its code points expanded in turn. The result may contain spaces.
sub rdecompose
{
    my ($dec) = @_;
    return $dec unless defined($dec) && exists $table{$dec};

    my $mapping = $table{$dec};
    $mapping =~ s/<[a-zA-Z]*>//g;
    $mapping = foldcombining($mapping);
    # Recurse on each code point of the mapping, not on the mapping string
    # used as a table key (which can never match).
    $mapping =~ s/\b([0-9A-F]{4,6})\b/rdecompose($1)/eg;
    return $mapping;
}

# decompose($removeprefix, $dec)
# Turns a decomposition field into output text: strips "$removeprefix "
# (with an empty prefix this strips every space), expands each 4-hex-digit
# code point through rdecompose(), writes code points as \uXXXX and finally
# replaces escapes below 0x80 by the literal character. A lone SPACE or
# REVERSE SOLIDUS stays escaped.
# NOTE(review): code points are tokenised as exactly four hex digits here,
# so supplementary-plane values are not handled as a unit.
sub decompose
{
    my ($removeprefix, $dec) = @_;
    $removeprefix .= " ";

    $dec =~ s/\Q$removeprefix\E//g;
    if ($dec eq "0020") {
        return "\\u0020";
    }
    if ($dec eq "005C") {
        return "\\u005C";
    }

    $dec =~ s/2044/\//g;    # FRACTION SLASH becomes a plain slash
    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
    $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
    $dec =~ s/ //g;
    return $dec;
}

# split_record($line)
# Splits one semicolon-separated data line, keeping trailing empty fields
# and padding so that fields 0..8 are always defined.
sub split_record
{
    my ($line) = @_;
    my @field = split(/;/, $line, -1);
    push(@field, "") while @field < 9;
    return @field;
}

# load_decomposition_table($fh)
# First pass: records field 5 (decomposition) for every character whose
# code point is above 256 and whose decomposition is not empty.
# NOTE(review): the "> 256" bound is kept from the original and leaves
# U+0100 out of the table - confirm whether ">= 256" was intended.
sub load_decomposition_table
{
    my ($fh) = @_;
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;
        my @field = split_record($line);
        next if $field[5] eq "";
        # Use this record's own code point (field 0).
        next unless hex($field[0]) > 256;
        $table{$field[0]} = $field[5];
    }
    return;
}

# process_record($line)
# Second pass, one data line: decides how (and whether) the character is
# transliterated and hands the result to output().
sub process_record
{
    my ($line) = @_;
    my @field = split_record($line);
    my ($u, $cmt, $dec, $d1, $d2, $d3) = @field[0, 1, 5, 6, 7, 8];
    my $udec = hex($u);

    return unless $udec > 127;    # ASCII needs no transliteration

    if ($dec ne "") {
        if ($dec =~ /</) {
            # formatted decomposition
            foreach my $rule (@FORMAT_RULES) {
                my ($tag, $handler) = @{$rule};
                next if index($dec, "<$tag>") < 0;
                my $str = defined($handler) ? $handler->($dec) : undef;
                output($u, $cmt, $udec, $str) if defined $str;
                return;
            }
            warning($line);
        } elsif ($cmt =~ /LATIN/) {
            # decomposition without format code; every non-Latin script
            # (Cyrillic, Greek, Arabic, CJK, Indic, ...) is ignored
            output($u, $cmt, $udec, decompose("", foldcombining($dec)));
        }
        return;
    }

    # no decomposition: only characters with a numeric value are emitted
    if ($d1 ne "") {
        output($u, $cmt, $udec, $d1);
        return;
    }
    my $value = ($d2 ne "") ? $d2 : $d3;
    return if $value eq "";
    if ($cmt =~ /CIRCLED/) {
        output($u, $cmt, $udec, "(" . $value . ")");
    } else {
        # others, use [ ]
        output($u, $cmt, $udec, "[" . $value . "]");
    }
    return;
}

# main() - opens the data and output files and runs both passes.
sub main
{
    open(my $data_fh, "<", "UnicodeData-Latest.txt")
        or die "cannot find UnicodeData-Latest.txt";
    open($out_fh, ">", "../tables/transliterate.properties")
        or die "cannot open output ../tables/transliterate.properties file";

    print {$out_fh} $header;

    # remove comments from $handcoded (lines starting with a single '#')
    my $entries = $handcoded;
    $entries =~ s/^#[^#].*\n//mg;
    print {$out_fh} $entries;

    load_decomposition_table($data_fh);

    seek($data_fh, 0, 0)
        or die "cannot rewind UnicodeData-Latest.txt: $!";
    while (my $line = <$data_fh>) {
        $line =~ s/\r?\n\z//;
        process_record($line);
    }

    close($data_fh);
    # Buffered write errors only surface at close time.
    close($out_fh)
        or die "cannot close ../tables/transliterate.properties: $!";
    return;
}

main() unless caller;

1;