intl/unicharutil/tools/gentransliterate.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/unicharutil/tools/gentransliterate.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,477 @@
     1.4 +#!/usr/bin/perl 
     1.5 +#
     1.6 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.7 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.9 +
    1.10 +$header = <<END_OF_HEADER;   # preamble printed verbatim at the top of the generated file
    1.11 +# This Source Code Form is subject to the terms of the Mozilla Public
    1.12 +# License, v. 2.0. If a copy of the MPL was not distributed with this
    1.13 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
    1.14 +
    1.15 +# 
    1.16 +# THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl
    1.17 +# PLEASE DO NOT MODIFY THIS FILE BY HAND
    1.18 +#
    1.19 +entity.list.name=transliterate
    1.20 +entity.169=(c)
    1.21 +#
    1.22 +#
    1.23 +# Here are the windows-1252 characters from the range 0x80 - 0x9F
    1.24 +#
    1.25 +END_OF_HEADER
    1.26 +
    1.27 +$handcoded = <<END_OF_HANDCODED;   # hand-written entries; lines starting with a single "#" are stripped before printing
    1.28 +# EURO SIGN
    1.29 +entity.8364=EUR
    1.30 +# SINGLE LOW-9 QUOTATION MARK
    1.31 +entity.8218=,
    1.32 +# LATIN SMALL LETTER F WITH HOOK
    1.33 +entity.402=f
    1.34 +# DOUBLE LOW-9 QUOTATION MARK
    1.35 +entity.8222="
    1.36 +# DAGGER
    1.37 +entity.8224=+
    1.38 +# DOUBLE DAGGER
    1.39 +entity.8225=++
    1.40 +# MODIFIER LETTER CIRCUMFLEX ACCENT
    1.41 +entity.710=^
    1.42 +# PER MILLE SIGN
    1.43 +entity.8240=0/00
    1.44 +# SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    1.45 +entity.8249=<
    1.46 +# LATIN CAPITAL LIGATURE OE
    1.47 +entity.338=OE
    1.48 +# LEFT SINGLE QUOTATION MARK
    1.49 +entity.8216='
    1.50 +# RIGHT SINGLE QUOTATION MARK
    1.51 +entity.8217='
    1.52 +# LEFT DOUBLE QUOTATION MARK
    1.53 +entity.8220="
    1.54 +# RIGHT DOUBLE QUOTATION MARK
    1.55 +entity.8221="
    1.56 +# BULLET
    1.57 +entity.8226=.
    1.58 +# EN DASH
    1.59 +entity.8211=--
    1.60 +# EM DASH
    1.61 +entity.8212=---
    1.62 +# SMALL TILDE
    1.63 +entity.732=~
    1.64 +# SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    1.65 +entity.8250=>
    1.66 +# LATIN SMALL LIGATURE OE
    1.67 +entity.339=oe
    1.68 +# U+2000 EN QUAD
    1.69 +entity.8192=\\u0020
    1.70 +# U+2001 EM QUAD
    1.71 +entity.8193=\\u0020
    1.72 +# U+2010 HYPHEN
    1.73 +entity.8208=-
    1.74 +# U+2011 NON-BREAKING HYPHEN
    1.75 +entity.8209=-
    1.76 +# U+2012 FIGURE DASH
    1.77 +entity.8210=-
    1.78 +# U+2015 HORIZONTAL BAR
    1.79 +entity.8213=--
    1.80 +# U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma)
    1.81 +entity.8203=
    1.82 +# U+2061, ApplyFunction, character showing function application in presentation tagging
    1.83 +entity.8289=
    1.84 +# U+2062, InvisibleTimes, marks multiplication when it is understood without a mark
    1.85 +entity.8290=
    1.86 +# U+2146, DifferentialD, d for use in differentials, e.g., within integrals
    1.87 +entity.8518=d
    1.88 +# U+2212, MINUS SIGN, official Unicode minus sign
    1.89 +entity.8722=-
    1.90 +# Hebrew punctuation
    1.91 +# U+05BE HEBREW PUNCTUATION MAQAF
    1.92 +entity.1470=-
    1.93 +# U+05C0 HEBREW PUNCTUATION PASEQ
    1.94 +entity.1472=|
    1.95 +# U+05C3 HEBREW PUNCTUATION SOF PASUQ
    1.96 +entity.1475=:
    1.97 +# U+05F3 HEBREW PUNCTUATION GERESH
    1.98 +entity.1523='
    1.99 +# U+05F4 HEBREW PUNCTUATION GERSHAYIM
   1.100 +entity.1524="
   1.101 +##
   1.102 +## End of hand coded section
   1.103 +## Below are generated from the unicode character database
   1.104 +##
   1.105 +END_OF_HANDCODED
   1.106 +
   1.107 +%table = ();   # code point (hex string) => decomposition field; looked up by rdecompose()
   1.108 +# Build an ASCII spelling for a Latin letter from its Unicode name.
   1.109 +#
   1.110 +# Argument:
   1.111 +#   $cmt - character name, e.g. "LATIN SMALL LETTER A WITH ACUTE"
   1.112 +# Returns:
   1.113 +#   "'" if the name contains "PRECEDED BY APOSTROPHE", then the letters
   1.114 +#   captured after "CAPITAL LETTER" and after "SMALL LETTER" (the latter
   1.115 +#   lower-cased), then one ASCII mark for each recognised word of the
   1.116 +#   name in order of appearance.  Other words add nothing.  The example
   1.117 +#   above yields "a'".
   1.118 +sub FromLatinComment
   1.119 +{
   1.120 +  my ($cmt) = (@_);
   1.121 +
   1.122 +  # Everything below is lexical so that the script-level @f, which holds
   1.123 +  # the fields of the current UnicodeData line, is not emptied here.
   1.124 +  # Diacritic word => ASCII mark appended to the base letter.
   1.125 +  my %mark_for = (
   1.126 +    "DOT"        => ".",
   1.127 +    "DIAERESIS"  => "\"",
   1.128 +    "BREVE"      => "(",
   1.129 +    "ACUTE"      => "\'",
   1.130 +    "GRAVE"      => "`",
   1.131 +    "TILDE"      => "~",
   1.132 +    "CARON"      => "(",
   1.133 +    "HOOK"       => "?",
   1.134 +    "CEDILLA"    => ",",
   1.135 +    "MACRON"     => "-",
   1.136 +    "CIRCUMFLEX" => "^",
   1.137 +    "RING"       => "*",
   1.138 +    "OGONEK"     => ";",
   1.139 +    "LINE"       => "_",
   1.140 +    "COMMA"      => ",",
   1.141 +    "STROKE"     => "/",
   1.142 +    "HORN"       => "+",
   1.143 +  );
   1.144 +
   1.145 +  my $char = "";
   1.146 +  if($cmt =~ /PRECEDED BY APOSTROPHE/) {
   1.147 +      $char = "\'";
   1.148 +  }
   1.149 +  if($cmt =~ /CAPITAL LETTER ([A-Z]*)/) {
   1.150 +      $char .= $1;
   1.151 +  }
   1.152 +  if($cmt =~ /SMALL LETTER ([A-Z]*)/) {
   1.153 +      $char .= lc($1);
   1.154 +  }
   1.155 +
   1.156 +  # foreach instead of "while shift": an empty word (two consecutive
   1.157 +  # spaces) must not end the scan early.
   1.158 +  foreach my $item (split(/ / , $cmt)) {
   1.159 +     if(exists $mark_for{$item}) {
   1.160 +       $char .= $mark_for{$item};
   1.161 +     }
   1.162 +  }
   1.163 +
   1.164 +  return $char;
   1.165 +}
   1.166 +# Print a "WARNING: ..." line on standard output (not STDERR).
   1.167 +sub warning {
   1.168 +  my $message = shift;
   1.169 +  print "WARNING: $message \n";
   1.170 +}
   1.171 +# Variant of output() that prints the entry on standard output and does
   1.172 +# no non-ASCII check.  $u and $cmt are accepted but not used.
   1.173 +sub doutput {
   1.174 +  my ($u, $cmt, $udec, $str) = @_;
   1.175 +  my $line = "entity.$udec=$str\n";
   1.176 +  print $line;
   1.177 +}
   1.178 +# Emit one entry on OUT, or fall back to a name-derived spelling.
   1.179 +# If $str is free of non-ASCII \uXXXX escapes, "entity.$udec=$str" is
   1.180 +# printed.  Otherwise, for names containing LATIN and LETTER but not LONG,
   1.181 +# the call is repeated with FromLatinComment($cmt); any other character
   1.182 +# produces no output.  $u is only passed through.
   1.183 +sub output {
   1.184 +  my ($u, $cmt, $udec, $str) = @_;
   1.185 +  unless (decomposeIntoNonASCII($str)) {
   1.186 +    print OUT "entity.$udec=$str\n";
   1.187 +    return;
   1.188 +  }
   1.189 +  return unless ($cmt =~ "LATIN") && ($cmt =~ "LETTER") && !($cmt =~ "LONG");
   1.190 +  output($u, $cmt, $udec, FromLatinComment($cmt));
   1.191 +}
   1.192 +
   1.193 +# True if $dec contains a \uXXXX escape (upper-case hex) of U+0080 or above.
   1.194 +sub decomposeIntoNonASCII {
   1.195 +  my $dec = shift;
   1.196 +  return $dec =~ /\\u([1-9A-F][0-9A-F][0-9A-F]|[0-9A-F][1-9A-F][0-9A-F]|00[8-9A-F])[0-9A-F]/;
   1.197 +}
   1.198 +
   1.199 +# Replace accent code points in a decomposition by ASCII look-alikes.
   1.200 +#
   1.201 +# Argument:
   1.202 +#   $dec - decomposition made of upper-case hex code points separated by
   1.203 +#          spaces, e.g. "0041 0301"
   1.204 +# Returns:
   1.205 +#   $dec with each code point listed in %fold replaced by the value given
   1.206 +#   there, e.g. "0041 0027".  Code points that are not listed (0306, 0309,
   1.207 +#   030A, 030C, ...) are returned unchanged.
   1.208 +sub foldcombining
   1.209 +{
   1.210 +  my ($dec) = (@_);
   1.211 +  # Lexical on purpose: none of these names is needed outside this sub.
   1.212 +  my $grave = "0060";
   1.213 +  my $acute = "0027";
   1.214 +  my $hat = "005E";
   1.215 +  my $tilde = "007E";
   1.216 +  my $overscore = "002D"; ## should be 00AF but we can only handle ASCII now
   1.217 +  my $umlaut = "0022"; ## should be 00A8 but we can only handle ASCII now
   1.218 +  my $doubleacute = "0022";
   1.219 +  my $dot = "002E";
   1.220 +  my $doublegrave = "0060 0060";
   1.221 +
   1.222 +  my %fold = (
   1.223 +    "00A8" => $umlaut,      "00AF" => $overscore,
   1.224 +    "00B4" => $acute,       "00B7" => $dot,
   1.225 +    "0300" => $grave,       "0301" => $acute,
   1.226 +    "0302" => $hat,         "0303" => $tilde,
   1.227 +    "0304" => $overscore,   "0305" => $overscore,
   1.228 +    "0307" => $dot,         "0308" => $umlaut,
   1.229 +    "030B" => $doubleacute, "030D" => $acute,
   1.230 +    "030E" => $doubleacute, "030F" => $doublegrave,
   1.231 +  );
   1.232 +  my $listed = join("|", sort keys %fold);
   1.233 +
   1.234 +  # One pass is enough because no replacement value is itself a key.
   1.235 +  # The \b anchors keep a key from matching inside a longer code point
   1.236 +  # (e.g. the "0300" in "10300"), which an unanchored match would mangle.
   1.237 +  $dec =~ s/\b($listed)\b/$fold{$1}/g;
   1.238 +  return $dec;
   1.239 +}
   1.240 +# Look $dec up in %table; return it unchanged when it has no entry.
   1.241 +# NOTE(review): for a hit, the recursion is keyed on $table{<folded value>},
   1.242 +# i.e. the folded decomposition string is used as a table key again rather
   1.243 +# than being decomposed itself.  This looks unintended - confirm.
   1.244 +sub rdecompose {
   1.245 +  my ($dec) = @_;
   1.246 +  return $dec unless exists $table{$dec};
   1.247 +  (my $folded = $table{$dec}) =~ s/<[a-zA-Z]*>//eg;
   1.248 +  $folded = foldcombining($folded);
   1.249 +  return rdecompose($table{$folded});
   1.250 +}
   1.251 +# Turn a UnicodeData decomposition field into the text written after "=".
   1.252 +#   $removeprefix - format tag such as "<compat>" or "".  It is used as a
   1.253 +#                   regex: "$removeprefix " is deleted ("" deletes all spaces)
   1.254 +#   $dec          - hex code points, possibly preceded by the tag
   1.255 +# A lone 0020 or 005C is returned as a \u escape.  Otherwise code points
   1.256 +# below U+0080 become literal characters, the rest \uXXXX; spaces are removed.
   1.257 +sub decompose {
   1.258 +  my ($removeprefix, $dec) = @_;
   1.259 +  my $strip = $removeprefix . " ";
   1.260 +  $dec =~ s/$strip//eg;
   1.261 +  return "\\u0020" if $dec eq "0020";
   1.262 +  return "\\u005C" if $dec eq "005C";
   1.263 +  my $slash = "\/";
   1.264 +  $dec =~ s/2044/$slash/eg;
   1.265 +  $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
   1.266 +  $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
   1.267 +  $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
   1.268 +  $dec =~ s/ //eg;
   1.269 +  return $dec;
   1.270 +}
   1.271 +
   1.272 +######################################################################
   1.273 +#
   1.274 +# Open the unicode database file, once per pass (it is read twice)
   1.275 +#
   1.276 +######################################################################
   1.277 +open ( UNICODATA , "<" , "UnicodeData-Latest.txt")
   1.278 +   || die "cannot open UnicodeData-Latest.txt: $!";
   1.279 +
   1.280 +open ( UNICODATA2 , "<" , "UnicodeData-Latest.txt")
   1.281 +   || die "cannot open UnicodeData-Latest.txt: $!";
   1.282 +######################################################################
   1.283 +#
   1.284 +# Open the output file
   1.285 +#
   1.286 +######################################################################
   1.287 +open ( OUT , ">" , "../tables/transliterate.properties")
   1.288 +  || die "cannot open output ../tables/transliterate.properties file: $!";
   1.289 +
   1.290 +print OUT $header;
   1.291 +
   1.292 +# remove comments from $handcoded (lines starting with a single "#"; "##" lines are kept)
   1.293 +$handcoded =~ s/^#[^#].*\n//mg;
   1.294 +print OUT $handcoded;
   1.295 +
   1.296 +######################################################################
   1.297 +# First pass over UnicodeData: store the decomposition field of every
   1.298 +# character above 256 in %table, which rdecompose() consults.
   1.299 +# NOTE(review): as written %table stays empty - see hex($u) below.
   1.300 +######################################################################
   1.301 +while(<UNICODATA2>) {
   1.302 +   chop;
   1.303 +   @f = split(/;/ , $_); 
   1.304 +   $udec = hex($u);   # NOTE(review): $u is not assigned until the second loop, so this is 0; $f[0] was probably intended
   1.305 +   if(($udec > 256 ) && ($f[5] ne "")) {   # never true while $udec is 0
   1.306 +     $table{$f[0]}=$f[5];   # enabling this changes the generated output; review rdecompose() first
   1.307 +   }
   1.308 +}
   1.309 +while(<UNICODATA>) {
   1.310 +   chop;
   1.311 +   ######################################################################
   1.312 +   # Second pass: call output() for each character that this script can
   1.313 +   # spell in ASCII.  The fields below are the ";"-separated columns of
   1.314 +   # one UnicodeData line.
   1.315 +   ######################################################################
   1.316 +   @f = split(/;/ , $_); 
   1.317 +   $u = $f[0];    # The unicode value
   1.318 +   $cmt = $f[1];  # The comment
   1.319 +   $dec = $f[5];  # The decomposed value
   1.320 +   $d1 = $f[6];  # column 6 (decimal digit value in the UnicodeData format)
   1.321 +   $d2 = $f[7];  # column 7 (digit value in the UnicodeData format)
   1.322 +   $d3 = $f[8];  # column 8 (numeric value in the UnicodeData format)
   1.323 +   $udec = hex($u);
   1.324 +
   1.325 +   if($udec > 128) 
   1.326 +   {
   1.327 +     # not ASCII (note: "> 128" also skips U+0080 itself)
   1.328 +     if($dec ne "") 
   1.329 +     {
   1.330 +       # have decomposition
   1.331 +       if($dec =~ /</)  {
   1.332 +           # formatted decomposition: the <tag> selects the handling below
   1.333 +           if($dec =~ /<wide>/)  {
   1.334 +              output($u,$cmt,$udec,&decompose("<wide>", $dec));
   1.335 +           } elsif($dec =~ /<narrow>/)  {
   1.336 +              # ignore non ASCII decomposition
   1.337 +              # warning($_);
   1.338 +           } elsif($dec =~ /<circle>/)  {
   1.339 +              output($u,$cmt,$udec,&decompose("<circle>", "(".$dec.")"));
   1.340 +           } elsif($dec =~ /<fraction>/)  {
   1.341 +              output($u,$cmt,$udec,&decompose("<fraction>", $dec));
   1.342 +           } elsif($dec =~ /<small>/)  {
   1.343 +              output($u,$cmt,$udec,&decompose("<small>", $dec));
   1.344 +           } elsif($dec =~ /<vertical>/)  {
   1.345 +              # ignore <vertical>
   1.346 +           } elsif($dec =~ /<super>/)  {
   1.347 +              output($u,$cmt,$udec,"^(".&decompose("<super>", $dec).")");
   1.348 +           } elsif($dec =~ /<sub>/)  {
   1.349 +              output($u,$cmt,$udec,"v(".&decompose("<sub>", $dec).")");
   1.350 +           } elsif($dec =~ /<font>/)  {
   1.351 +               output($u,$cmt,$udec,&decompose("<font>", $dec));
   1.352 +           } elsif($dec =~ /<square>/)  {
   1.353 +              # ignore <square>
   1.354 +              # warning($_);
   1.355 +           } elsif($dec =~ /<compat>/)  {
   1.356 +               output($u,$cmt,$udec,&decompose("<compat>", $dec));
   1.357 +           } elsif($dec =~ /<isolated>/)  {
   1.358 +              # ignore <isolated>
   1.359 +              # warning($_);
   1.360 +           } elsif($dec =~ /<medial>/)  {
   1.361 +              # ignore <medial>
   1.362 +              # warning($_);
   1.363 +           } elsif($dec =~ /<final>/)  {
   1.364 +              # ignore <final>
   1.365 +              # warning($_);
   1.366 +           } elsif($dec =~ /<initial>/)  {
   1.367 +              # ignore <initial>
   1.368 +              # warning($_);
   1.369 +           } elsif($dec =~ /<noBreak>/)  {
   1.370 +             if($dec eq "<noBreak> 0020")
   1.371 +             {
   1.372 +               output($u,$cmt,$udec,"\\u0020");
   1.373 +             } else {
   1.374 +              # ignore every other <noBreak> decomposition
   1.375 +              # warning($_);
   1.376 +             }
   1.377 +           } else {
   1.378 +             warning($_);
   1.379 +           }
   1.380 +       } else {
   1.381 +         # decomposition without format code: only LATIN names are handled
   1.382 +         if($cmt =~ /LATIN/) {
   1.383 +           $dec = foldcombining($dec);
   1.384 +              output($u,$cmt,$udec,&decompose("", $dec));
   1.385 +         } elsif($cmt =~ /CYRILLIC/) {
   1.386 +              # ignore 
   1.387 +              # warning($_);
   1.388 +         } elsif($cmt =~ /GREEK/) {
   1.389 +              # ignore 
   1.390 +              # warning($_);
   1.391 +         } elsif($cmt =~ /ARABIC/) {
   1.392 +              # ignore 
   1.393 +              # warning($_);
   1.394 +         } elsif($cmt =~ /CJK/) {
   1.395 +              # ignore 
   1.396 +              # warning($_);
   1.397 +         } elsif($cmt =~ /HEBREW/) {
   1.398 +              # ignore 
   1.399 +              # warning($_);
   1.400 +         } elsif($cmt =~ /DEVANAGARI/) {
   1.401 +              # ignore 
   1.402 +              # warning($_);
   1.403 +         } elsif($cmt =~ /BENGALI/) {
   1.404 +              # ignore 
   1.405 +              # warning($_);
   1.406 +         } elsif($cmt =~ /GURMUKHI/) {
   1.407 +              # ignore 
   1.408 +              # warning($_);
   1.409 +         } elsif($cmt =~ /ORIYA/) {
   1.410 +              # ignore 
   1.411 +              # warning($_);
   1.412 +         } elsif($cmt =~ /TAMIL/) {
   1.413 +              # ignore 
   1.414 +              # warning($_);
   1.415 +         } elsif($cmt =~ /TELUGU/) {
   1.416 +              # ignore 
   1.417 +              # warning($_);
   1.418 +         } elsif($cmt =~ /KANNADA/) {
   1.419 +              # ignore 
   1.420 +              # warning($_);
   1.421 +         } elsif($cmt =~ /MALAYALAM/) {
   1.422 +              # ignore 
   1.423 +              # warning($_);
   1.424 +         } elsif($cmt =~ /SINHALA/) {
   1.425 +              # ignore 
   1.426 +              # warning($_);
   1.427 +         } elsif($cmt =~ /TIBETAN/) {
   1.428 +              # ignore 
   1.429 +              # warning($_);
   1.430 +         } elsif($cmt =~ /MYANMAR/) {
   1.431 +              # ignore 
   1.432 +              # warning($_);
   1.433 +         } elsif($cmt =~ /KATAKANA/) {
   1.434 +              # ignore 
   1.435 +              # warning($_);
   1.436 +         } elsif($cmt =~ /HIRAGANA/) {
   1.437 +              # ignore 
   1.438 +              # warning($_);
   1.439 +         } else {
   1.440 +              # ignore 
   1.441 +              # warning($_);
   1.442 +         }
   1.443 +       }
   1.444 +     } else {
   1.445 +       # do not have decomposition
   1.446 +       if ($d1 ne "") 
   1.447 +       {
   1.448 +         # column 6 is set: emit its value as is
   1.449 +         output($u,$cmt,$udec,$d1);
   1.450 +       } elsif ($d2 ne "") {
   1.451 +         if($cmt =~ /CIRCLED/) {
   1.452 +           # circled
   1.453 +           output($u,$cmt,$udec,"(".$d2.")");
   1.454 +         } else {
   1.455 +           # others, use [ ]
   1.456 +           output($u,$cmt,$udec,"[".$d2."]");
   1.457 +         }
   1.458 +       } elsif ($d3 ne "") {
   1.459 +         if($cmt =~ /CIRCLED/) {
   1.460 +           # circled
   1.461 +           output($u,$cmt,$udec,"(".$d3.")");
   1.462 +         } else {
   1.463 +           # others, use [ ]
   1.464 +           output($u,$cmt,$udec,"[".$d3."]");
   1.465 +         }
   1.466 +       } else {
   1.467 +         # not numeric characters
   1.468 +
   1.469 +       } # end of no decomposition
   1.470 +     } # end of have/not decomposition
   1.471 +   }
   1.472 +}
   1.473 +######################################################################
   1.474 +# Close every handle opened by this script.  The close of OUT is checked
   1.475 +# because buffered write errors are only reported at that point.
   1.476 +######################################################################
   1.477 +close(UNICODATA);
   1.478 +close(UNICODATA2);
   1.479 +close(OUT) || die "cannot close ../tables/transliterate.properties: $!";
   1.480 +

mercurial