intl/unicharutil/tools/gentransliterate.pl

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 #!/usr/bin/perl
michael@0 2 #
michael@0 3 # This Source Code Form is subject to the terms of the Mozilla Public
michael@0 4 # License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0 6
# Fixed preamble of the generated properties file: licence notice, the
# "generated file, do not edit" banner, the list name and the first entry.
# It is printed to the output file verbatim.
$header = <<"END_OF_HEADER";
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

#
# THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl
# PLEASE DO NOT MODIFY THIS FILE BY HAND
#
entity.list.name=transliterate
entity.169=(c)
#
#
# Here are the windows-1252 characters from the range 0x80 - 0x9F
#
END_OF_HEADER
michael@0 23
# Hand-maintained transliterations, one "entity.<decimal code point>=<ASCII>"
# line per character.  The single-'#' comment lines document each entry here
# only: they are stripped from this string before it is written to the
# output file, while the '##' lines at the end are kept.
$handcoded = <<"END_OF_HANDCODED";
# EURO SIGN
entity.8364=EUR
# SINGLE LOW-9 QUOTATION MARK
entity.8218=,
# LATIN SMALL LETTER F WITH HOOK
entity.402=f
# DOUBLE LOW-9 QUOTATION MARK
entity.8222="
# DAGGER
entity.8224=+
# DOUBLE DAGGER
entity.8225=++
# MODIFIER LETTER CIRCUMFLEX ACCENT
entity.710=^
# PER MILLE SIGN
entity.8240=0/00
# SINGLE LEFT-POINTING ANGLE QUOTATION MARK
entity.8249=<
# LATIN CAPITAL LIGATURE OE
entity.338=OE
# LEFT SINGLE QUOTATION MARK
entity.8216='
# RIGHT SINGLE QUOTATION MARK
entity.8217='
# LEFT DOUBLE QUOTATION MARK
entity.8220="
# RIGHT DOUBLE QUOTATION MARK
entity.8221="
# BULLET
entity.8226=.
# EN DASH
entity.8211=--
# EM DASH
entity.8212=---
# SMALL TILDE
entity.732=~
# SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
entity.8250=>
# LATIN SMALL LIGATURE OE
entity.339=oe
# U+2000 EN QUAD
entity.8192=\\u0020
# U+2001 EM QUAD
entity.8193=\\u0020
# U+2010 HYPHEN
entity.8208=-
# U+2011 NON-BREAKING HYPHEN
entity.8209=-
# U+2012 FIGURE DASH
entity.8210=-
# U+2015 HORIZONTAL BAR
entity.8213=--
# U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma)
entity.8203=
# U+2061, ApplyFunction, character showing function application in presentation tagging
entity.8289=
# U+2062, InvisibleTimes, marks multiplication when it is understood without a mark
entity.8290=
# U+2146, DifferentialD, d for use in differentials, e.g., within integrals
entity.8518=d
# U+2212, MINUS SIGN, official Unicode minus sign
entity.8722=-
# Hebrew punctuation
# U+05BE HEBREW PUNCTUATION MAQAF
entity.1470=-
# U+05C0 HEBREW PUNCTUATION PASEQ
entity.1472=|
# U+05C3 HEBREW PUNCTUATION SOF PASUQ
entity.1475=:
# U+05F3 HEBREW PUNCTUATION GERESH
entity.1523='
# U+05F4 HEBREW PUNCTUATION GERSHAYIM
entity.1524="
##
## End of hand coded section
## Below are generated from the unicode character database
##
END_OF_HANDCODED
michael@0 103
# Decomposition lookup: hex code point (first field of a data line) =>
# decomposition mapping (sixth field).  The rest of the script stores into
# and reads from the hash %table, so reset the hash rather than an unrelated
# array of the same name.
%table = ();
# FromLatinComment($cmt)
#
# Builds an ASCII approximation of a Latin letter from its character name
# ($cmt), e.g. "LATIN SMALL LETTER A WITH RING ABOVE" gives "a*".
#
# The result is, in order:
#   - an apostrophe when the name contains "PRECEDED BY APOSTROPHE",
#   - the word following "CAPITAL LETTER" as is,
#   - the word following "SMALL LETTER" in lower case,
#   - one ASCII stand-in per diacritic keyword, in the order the keywords
#     appear in the name.
# Words that are not diacritic keywords contribute nothing.
#
# Returns the assembled string (possibly empty).
sub FromLatinComment
{
    my ($cmt) = @_;

    # ASCII stand-in appended for each diacritic keyword found in the name.
    my %mark_for = (
        "DOT"        => ".",
        "DIAERESIS"  => "\"",
        "BREVE"      => "(",
        "ACUTE"      => "\'",
        "GRAVE"      => "`",
        "TILDE"      => "~",
        "CARON"      => "(",
        "HOOK"       => "?",
        "CEDILLA"    => ",",
        "MACRON"     => "-",
        "CIRCUMFLEX" => "^",
        "RING"       => "*",
        "OGONEK"     => ";",
        "LINE"       => "_",
        "COMMA"      => ",",
        "STROKE"     => "/",
        "HORN"       => "+",
    );

    # Everything is lexical on purpose: the caller keeps its own field list
    # in a package array named @f, which must not be disturbed from here.
    my $char = "";
    if ($cmt =~ /PRECEDED BY APOSTROPHE/) {
        $char = "\'";
    }
    if ($cmt =~ /CAPITAL LETTER ([A-Z]*)/) {
        $char .= $1;
    }
    if ($cmt =~ /SMALL LETTER ([A-Z]*)/) {
        $char .= lc($1);
    }

    # Walk every word of the name; an empty word (double space) or a "0"
    # must not end the scan early, so iterate instead of shifting until false.
    foreach my $word (split(/ /, $cmt)) {
        $char .= $mark_for{$word} if exists $mark_for{$word};
    }

    return $char;
}
# warning($text)
#
# Prints $text on the currently selected output handle, prefixed with
# "WARNING: " and followed by a space and a newline.
sub warning
{
    my $text = shift;
    print "WARNING: " . $text . " \n";
}
# doutput($u, $cmt, $udec, $str)
#
# Debugging twin of output(): prints the "entity.<udec>=<str>" line on the
# currently selected output handle instead of the properties file, with no
# filtering.  $u (hex code point) and $cmt (character name) are accepted
# for signature compatibility but not printed.
sub doutput
{
    my (undef, undef, $udec, $str) = @_;
    print "entity." . $udec . "=" . $str . "\n";
}
# output($u, $cmt, $udec, $str)
#
# Writes "entity.<udec>=<str>" to the OUT handle when $str is pure ASCII
# (contains no \uXXXX escape at or above 0080).
#
# When $str still contains such an escape, the entry is only kept for
# characters whose name ($cmt) contains LATIN and LETTER but not LONG: the
# replacement is then rebuilt from the name with FromLatinComment() and run
# through output() again.  Every other character is silently dropped.
#
#   $u     hex code point (only passed along)
#   $cmt   character name
#   $udec  decimal code point, used as the entity key
#   $str   candidate replacement text
sub output
{
    my ($u, $cmt, $udec, $str) = @_;

    unless (decomposeIntoNonASCII($str)) {
        print OUT "entity.$udec=$str\n";
        return;
    }

    # Non-ASCII replacement: fall back to the name-based spelling, but only
    # for Latin letters.
    return unless ($cmt =~ /LATIN/) && ($cmt =~ /LETTER/) && ($cmt !~ /LONG/);

    output($u, $cmt, $udec, FromLatinComment($cmt));
}
michael@0 189
# decomposeIntoNonASCII($dec)
#
# True when $dec contains at least one \uXXXX escape (upper-case hex) whose
# value is 0080 or above, i.e. when the replacement text is not pure ASCII.
sub decomposeIntoNonASCII
{
    my ($text) = @_;
    return $text =~ m{
        \\u
        (
            [1-9A-F][0-9A-F][0-9A-F]    # first digit non-zero
          | [0-9A-F][1-9A-F][0-9A-F]    # second digit non-zero
          | 00[8-9A-F]                  # 0080 .. 00FF
        )
        [0-9A-F]
    }x;
}
michael@0 195
# foldcombining($dec)
#
# Replaces accent code points in a decomposition string (four-digit hex
# code points separated by spaces) with the code point of an ASCII
# look-alike, and returns the rewritten string.
#
# Only whole four-digit code points are replaced, so a longer code point
# that merely contains one of the keys (e.g. 10300) is left untouched.
# Code points without an entry below (0306, 0309, 030A, 030C, ...) are kept
# as they are.
sub foldcombining
{
    my ($dec) = @_;

    # accent code point => ASCII replacement code point(s)
    my %ascii_for = (
        "00A8" => "0022",       # umlaut; should stay 00A8 but only ASCII is handled
        "00AF" => "002D",       # overscore; should stay 00AF but only ASCII is handled
        "00B4" => "0027",       # acute
        "00B7" => "002E",       # dot
        "0300" => "0060",       # grave
        "0301" => "0027",       # acute
        "0302" => "005E",       # hat
        "0303" => "007E",       # tilde
        "0304" => "002D",       # overscore
        "0305" => "002D",       # overscore
        "0307" => "002E",       # dot
        "0308" => "0022",       # umlaut
        "030B" => "0022",       # double acute
        "030D" => "0027",       # acute
        "030E" => "0022",       # double acute
        "030F" => "0060 0060",  # double grave
    );

    # One pass over every stand-alone four-digit code point; the lookarounds
    # keep the match from landing inside a five- or six-digit code point.
    $dec =~ s/(?<![0-9A-F])([0-9A-F]{4})(?![0-9A-F])/exists $ascii_for{$1} ? $ascii_for{$1} : $1/eg;

    return $dec;
}
# rdecompose($dec)
#
# Recursively expands one code point through %table (hex code point =>
# decomposition mapping).  When $dec has an entry, its mapping is stripped of
# any <tag>, passed through foldcombining(), and expanded again; otherwise
# $dec is returned unchanged.
#
# The recursion continues on the mapping itself.  Looking the mapping up in
# %table and recursing on that lookup result loses the expansion, because
# %table is keyed by single code points, not by mapping strings.
sub rdecompose
{
    my ($dec) = @_;

    return $dec unless exists $table{$dec};

    my $mapping = $table{$dec};
    $mapping =~ s/<[a-zA-Z]*>//g;
    return rdecompose(foldcombining($mapping));
}
# decompose($removeprefix, $dec)
#
# Turns a decomposition mapping into the text written after "entity.N=".
#
#   $removeprefix  format tag to strip, e.g. "<wide>", or "" for none
#   $dec           the mapping: hex code points separated by spaces
#
# Every occurrence of "$removeprefix " (tag plus one space) is removed first;
# with an empty tag that removes every space.  Then:
#   - a lone 0020 or 005C is returned as the escape \u0020 / \u005C,
#   - otherwise 2044 becomes "/", each four-digit code point is expanded
#     with rdecompose() and written as \uXXXX, escapes in the range
#     \u0000 - \u007F are replaced by the character itself, and all
#     remaining spaces are dropped.
#
# Returns the resulting string.
sub decompose
{
    my ($removeprefix, $dec) = @_;

    # The tag is literal text, so quote it before using it as a pattern.
    my $prefix = $removeprefix . " ";
    $dec =~ s/\Q$prefix\E//g;

    # Space and backslash stay escaped instead of being written raw.
    return "\\u0020" if $dec eq "0020";
    return "\\u005C" if $dec eq "005C";

    $dec =~ s/2044/\//g;
    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
    $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
    $dec =~ s/ //g;

    return $dec;
}
michael@0 268
######################################################################
#
# Open the unicode database file
#
# The file is read twice through two handles: the first pass is meant to
# collect decomposition mappings in %table, the second pass writes the
# entries.
#
######################################################################
my $unicode_data = "UnicodeData-Latest.txt";

open(UNICODATA, "<", $unicode_data)
    || die "cannot find $unicode_data: $!";

open(UNICODATA2, "<", $unicode_data)
    || die "cannot find $unicode_data: $!";

######################################################################
#
# Open the output file
#
######################################################################
my $properties = "../tables/transliterate.properties";

open(OUT, ">", $properties)
    || die "cannot open output $properties file: $!";

print OUT $header;

# remove comments from $handcoded
$handcoded =~ s/^#[^#].*\n//mg;
print OUT $handcoded;

######################################################################
#
# First pass: collect decomposition mappings
#
######################################################################
while (<UNICODATA2>) {
    chomp;
    my @field = split(/;/, $_);
    # FIXME(review): $u is only assigned in the second pass, so it is still
    # undefined here, hex() yields 0 and no line ever passes the test below:
    # %table stays empty.  The test presumably wants hex($field[0]); it is
    # left as is because changing it alters the generated file and must be
    # done together with a review of rdecompose().
    $udec = hex($u);
    if (($udec > 256) && ($field[5] ne "")) {
        $table{$field[0]} = $field[5];
    }
}

######################################################################
#
# Second pass: process the file line by line
#
######################################################################

# Format tags of a decomposition mapping, in the order they are tested; the
# first one found in the mapping decides how the line is handled.
my @format_tags = qw(wide narrow circle fraction small vertical super sub
                     font square compat isolated medial final initial noBreak);

while (<UNICODATA>) {
    chomp;

    # Get value from fields
    my @field = split(/;/, $_);
    my $u     = $field[0];      # The unicode value
    my $cmt   = $field[1];      # The comment
    my $dec   = $field[5];      # The decomposed value
    my $d1    = $field[6];
    my $d2    = $field[7];
    my $d3    = $field[8];
    my $udec  = hex($u);

    # Only code points above 128 produce entries.
    next unless $udec > 128;

    if ($dec ne "") {
        # have decomposition
        if ($dec =~ /</) {
            # formatted decomposition
            my ($tag) = grep { $dec =~ /<$_>/ } @format_tags;

            if (!defined $tag) {
                # a tag this script does not know about
                warning($_);
            } elsif ($tag =~ /^(?:wide|fraction|small|font|compat)$/) {
                output($u, $cmt, $udec, decompose("<$tag>", $dec));
            } elsif ($tag eq "circle") {
                output($u, $cmt, $udec, decompose("<circle>", "(" . $dec . ")"));
            } elsif ($tag eq "super") {
                output($u, $cmt, $udec, "^(" . decompose("<super>", $dec) . ")");
            } elsif ($tag eq "sub") {
                output($u, $cmt, $udec, "v(" . decompose("<sub>", $dec) . ")");
            } elsif ($tag eq "noBreak") {
                # only the no-break space is transliterated
                if ($dec eq "<noBreak> 0020") {
                    output($u, $cmt, $udec, "\\u0020");
                }
            }
            # narrow, vertical, square, isolated, medial, final and initial
            # mappings are ignored.
        } elsif ($cmt =~ /LATIN/) {
            # decomposition without format code: only Latin is handled, every
            # other script is ignored
            output($u, $cmt, $udec, decompose("", foldcombining($dec)));
        }
    } elsif ($d1 ne "") {
        # do not have decomposition, are numeric characters
        output($u, $cmt, $udec, $d1);
    } else {
        # fall back to the eighth field, then the ninth; circled characters
        # are written as (n), all others as [n]
        my $value = ($d2 ne "") ? $d2 : $d3;
        if ($value ne "") {
            if ($cmt =~ /CIRCLED/) {
                output($u, $cmt, $udec, "(" . $value . ")");
            } else {
                output($u, $cmt, $udec, "[" . $value . "]");
            }
        }
    }
}

######################################################################
#
# Close files
#
######################################################################
close(UNICODATA);
close(UNICODATA2);
# Buffered write errors only show up here, so check the result.
close(OUT)
    || die "cannot finish writing $properties: $!";
michael@0 477

mercurial