intl/unicharutil/tools/gentransliterate.pl

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 #!/usr/bin/perl 
     2 #
     3 # This Source Code Form is subject to the terms of the Mozilla Public
     4 # License, v. 2.0. If a copy of the MPL was not distributed with this
     5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
     7 $header = <<END_OF_HEADER;
     8 # This Source Code Form is subject to the terms of the Mozilla Public
     9 # License, v. 2.0. If a copy of the MPL was not distributed with this
    10 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
    12 # 
    13 # THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl
    14 # PLEASE DO NOT MODIFY THIS FILE BY HAND
    15 #
    16 entity.list.name=transliterate
    17 entity.169=(c)
    18 #
    19 #
    20 # Here are the windows-1252 characters from the range 0x80 - 0x9F
    21 #
    22 END_OF_HEADER
    24 $handcoded = <<END_OF_HANDCODED;
    25 # EURO SIGN
    26 entity.8364=EUR
    27 # SINGLE LOW-9 QUOTATION MARK
    28 entity.8218=,
    29 # LATIN SMALL LETTER F WITH HOOK
    30 entity.402=f
    31 # DOUBLE LOW-9 QUOTATION MARK
    32 entity.8222="
    33 # DAGGER
    34 entity.8224=+
    35 # DOUBLE DAGGER
    36 entity.8225=++
    37 # MODIFIER LETTER CIRCUMFLEX ACCENT
    38 entity.710=^
    39 # PER MILLE SIGN
    40 entity.8240=0/00
    41 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    42 entity.8249=<
    43 # LATIN CAPITAL LIGATURE OE
    44 entity.338=OE
    45 # LEFT SINGLE QUOTATION MARK
    46 entity.8216='
    47 # RIGHT SINGLE QUOTATION MARK
    48 entity.8217='
    49 # LEFT DOUBLE QUOTATION MARK
    50 entity.8220="
    51 # RIGHT DOUBLE QUOTATION MARK
    52 entity.8221="
    53 # BULLET
    54 entity.8226=.
    55 # EN DASH
    56 entity.8211=--
    57 # EM DASH
    58 entity.8212=---
    59 # SMALL TILDE
    60 entity.732=~
    61 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    62 entity.8250=>
    63 # LATIN SMALL LIGATURE OE
    64 entity.339=oe
    65 # U+2000 EN QUAD
    66 entity.8192=\\u0020
    67 # U+2001 EM QUAD
    68 entity.8193=\\u0020
    69 # U+2010 HYPHEN
    70 entity.8208=-
    71 # U+2011 NON-BREAKING HYPHEN
    72 entity.8209=-
    73 # U+2012 FIGURE DASH
    74 entity.8210=-
    75 # U+2015 HORIZONTAL BAR
    76 entity.8213=--
    77 # U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma)
    78 entity.8203=
    79 # U+2061, ApplyFunction, character showing function application in presentation tagging
    80 entity.8289=
    81 # U+2062, InvisibleTimes, marks multiplication when it is understood without a mark
    82 entity.8290=
    83 # U+2146, DifferentialD, d for use in differentials, e.g., within integrals
    84 entity.8518=d
    85 # U+2212, MINUS SIGN, official Unicode minus sign
    86 entity.8722=-
    87 # Hebrew punctuation
    88 # U+05BE HEBREW PUNCTUATION MAQAF
    89 entity.1470=-
    90 # U+05C0 HEBREW PUNCTUATION PASEQ
    91 entity.1472=|
    92 # U+05C3 HEBREW PUNCTUATION SOF PASUQ
    93 entity.1475=:
    94 # U+05F3 HEBREW PUNCTUATION GERESH
    95 entity.1523='
    96 # U+05F4 HEBREW PUNCTUATION GERSHAYIM
    97 entity.1524="
    98 ##
    99 ## End of hand coded section
   100 ## Below are generated from the unicode character database
   101 ##
   102 END_OF_HANDCODED
   104 @table = ();
   105 sub FromLatinComment
   106 {
   107   my ($cmt) = (@_);
   108   $char = "";
   109   if($cmt =~ /PRECEDED BY APOSTROPHE/) {
   110       $char = "\'";
   111   }
   112   if($cmt =~ /CAPITAL LETTER ([A-Z]*)/) {
   113       $char = $char . $1;
   114   }
   115   if($cmt =~ /SMALL LETTER ([A-Z]*)/) {
   116       $char = $char . lc($1);
   117   }
   118   @f = split(/ / , $cmt); 
   119   while($item = shift @f) {
   120      if($item eq "DOT") {
   121        $char .= ".";
   122      } elsif ($item eq "DIAERESIS") {
   123        $char .= "\"";
   124      } elsif ($item eq "BREVE") {
   125        $char .= "(";
   126      } elsif ($item eq "ACUTE") {
   127        $char .= "\'";
   128      } elsif ($item eq "GRAVE") {
   129        $char .= "`";
   130      } elsif ($item eq "TILDE") {
   131        $char .= "~";
   132      } elsif ($item eq "CARON") {
   133        $char .= "(";
   134      } elsif ($item eq "HOOK") {
   135        $char .= "?";
   136      } elsif ($item eq "CEDILLA") {
   137        $char .= ",";
   138      } elsif ($item eq "MACRON") {
   139        $char .= "-";
   140      } elsif ($item eq "CIRCUMFLEX") {
   141        $char .= "^";
   142      } elsif ($item eq "RING") {
   143        $char .= "*";
   144      } elsif ($item eq "OGONEK") {
   145        $char .= ";";
   146      } elsif ($item eq "LINE") {
   147        $char .= "_";
   148      } elsif ($item eq "COMMA") {
   149        $char .= ",";
   150      } elsif ($item eq "STROKE") {
   151        $char .= "/";
   152      } elsif ($item eq "HORN") {
   153        $char .= "+";
   154      } elsif ($item =~ /^(LATIN|CAPITAL|SMALL|LETTER|WITH|ABOVE|BELOW|INVERTED|MIDDLE|AND|BY|APOSTROPHE|[A-Z])$/) {
   155        # ignore
   156      } else {
   157        #print "AAAA $item\n";
   158      }
   159   }
   161   return $char;
   162 }
   163 sub warning
   164 {
   165   my ($warning) = (@_);
   166   print "WARNING: $warning \n";
   167 }
   168 sub doutput
   169 {
   170   my ($u, $cmt, $udec, $str) = (@_);
   171   # don't print out comments - for debugging purposes only
   172   # print "# U+$u $cmt\n";
   173   print "entity.$udec=$str\n";
   174 }
   175 sub output
   176 {
   177   my ($u, $cmt, $udec, $str) = (@_);
   178   if(decomposeIntoNonASCII($str)) {
   179     if(($cmt =~ "LATIN")  && ($cmt =~ "LETTER") && !($cmt =~ "LONG")) {
   180        $str = FromLatinComment($cmt);
   181        output($u,$cmt,$udec,$str);
   182     }
   183   } else {
   184     # don't print out comments - for debugging purposes only
   185     # print OUT "# U+$u $cmt\n";
   186     print OUT "entity.$udec=$str\n";
   187   }
   188 }
   190 sub decomposeIntoNonASCII
   191 {
   192   my ($dec) = (@_);
   193   return $dec =~ /\\u([1-9A-F][0-9A-F][0-9A-F]|[0-9A-F][1-9A-F][0-9A-F]|00[8-9A-F])[0-9A-F]/;
   194 }
   196 sub foldcombining
   197 {
   198   my ($dec) = (@_);
   199   $grave = "0060";
   200   $acute = "0027";
   201   $hat = "005E";
   202   $hat = "005E";
   203   $tilde = "007E";
   204   $overscore = "002D"; ## should be 00AF but we can only handle ASCII now
   205   $umlaut = "0022"; ## should be 00A8 but we can only handle ASCII now
   206   $doubleacute = "0022";
   207   $dot = "002E";
   208   $doublegrave = "0060 0060";
   211   $dec =~ s/00A8/$umlaut/eg;
   212   $dec =~ s/00AF/$overscore/eg;
   213  # $dec =~ s/00B0//eg;
   214   $dec =~ s/00B4/$acute/eg;
   215   $dec =~ s/00B7/$dot/eg;
   216  # $dec =~ s/00B8//eg;
   217   $dec =~ s/0300/$grave/eg;
   218   $dec =~ s/0301/$acute/eg;
   219   $dec =~ s/0302/$hat/eg;
   220   $dec =~ s/0303/$tilde/eg;
   221   $dec =~ s/0304/$overscore/eg;
   222   $dec =~ s/0305/$overscore/eg;
   223  #$dec =~ s/0306/?/eg;
   224   $dec =~ s/0307/$dot/eg;
   225   $dec =~ s/0308/$umlaut/eg;
   226  #$dec =~ s/0309/?/eg;
   227  #$dec =~ s/030A/?/eg;
   228   $dec =~ s/030B/$doubleacute/eg;
   229  #$dec =~ s/030C/?/eg;
   230   $dec =~ s/030D/$acute/eg;
   231   $dec =~ s/030E/$doubleacute/eg;
   232   $dec =~ s/030F/$doublegrave/eg;
   234  # $dec =~ s/03[0-9A-F][0-9A-F]//eg; ## drop others
   235   return $dec;
   236 }
   237 sub rdecompose
   238 {
   239   my ($dec) = (@_);
   240   if(exists $table{$dec}) {
   241     $t = $table{$dec};
   242     $t =~ s/<[a-zA-Z]*>//eg;
   243     $t = foldcombining($t);
   244     return rdecompose( $table{$t});
   245   }
   246   return $dec;
   247 }
   248 sub decompose
   249 {
   250   my ($removeprefix, $dec) = (@_);
   251   $removeprefix .= " ";
   253   $dec =~ s/$removeprefix//eg;
   254   if($dec eq "0020") {
   255    $dec = "\\u0020";
   256   } elsif($dec eq "005C") {
   257    $dec = "\\u005C";
   258   } else {
   259    $k = "\/";
   260    $dec =~ s/2044/$k/eg;
   261    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
   262    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
   263    $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
   264    $dec =~ s/ //eg;
   265   } 
   266   return $dec;
   267 }
   269 ######################################################################
   270 #
   271 # Open the unicode database file
   272 #
   273 ######################################################################
   274 open ( UNICODATA , "< UnicodeData-Latest.txt") 
   275    || die "cannot find UnicodeData-Latest.txt";
   277 open ( UNICODATA2 , "< UnicodeData-Latest.txt") 
   278    || die "cannot find UnicodeData-Latest.txt";
   279 ######################################################################
   280 #
   281 # Open the output file
   282 #
   283 ######################################################################
   284 open ( OUT , "> ../tables/transliterate.properties") 
   285   || die "cannot open output ../tables/transliterate.properties file";
   287 print OUT $header;
   289 # remove comments from $handcoded
   290 $handcoded =~ s/^#[^#].*\n//mg;
   291 print OUT $handcoded;
   293 ######################################################################
   294 #
   295 # Process the file line by line
   296 #
   297 ######################################################################
   298 while(<UNICODATA2>) {
   299    chop;
   300    @f = split(/;/ , $_); 
   301    $udec = hex($u);
   302    if(($udec > 256 ) && ($f[5] ne "")) {
   303      $table{$f[0]}=$f[5];
   304    }
   305 }
   306 while(<UNICODATA>) {
   307    chop;
   308    ######################################################################
   309    #
   310    # Get value from fields
   311    #
   312    ######################################################################
   313    @f = split(/;/ , $_); 
   314    $u = $f[0];    # The unicode value
   315    $cmt = $f[1];  # The comment
   316    $dec = $f[5];  # The decomposed value
   317    $d1 = $f[6];  
   318    $d2 = $f[7];  
   319    $d3 = $f[8];  
   320    $udec = hex($u);
   322    if($udec > 128) 
   323    {
   324      # not ASCII
   325      if($dec ne "") 
   326      {
   327        # have decomposition
   328        if($dec =~ /</)  {
   329            # formated decomposition
   330            if($dec =~ /<wide>/)  {
   331               output($u,$cmt,$udec,&decompose("<wide>", $dec));
   332            } elsif($dec =~ /<narrow>/)  {
   333               # ignore non ASCII decomposition
   334               # warning($_);
   335            } elsif($dec =~ /<circle>/)  {
   336               output($u,$cmt,$udec,&decompose("<circle>", "(".$dec.")"));
   337            } elsif($dec =~ /<fraction>/)  {
   338               output($u,$cmt,$udec,&decompose("<fraction>", $dec));
   339            } elsif($dec =~ /<small>/)  {
   340               output($u,$cmt,$udec,&decompose("<small>", $dec));
   341            } elsif($dec =~ /<vertical>/)  {
   342               # warning($_);
   343            } elsif($dec =~ /<super>/)  {
   344               output($u,$cmt,$udec,"^(".&decompose("<super>", $dec).")");
   345            } elsif($dec =~ /<sub>/)  {
   346               output($u,$cmt,$udec,"v(".&decompose("<sub>", $dec).")");
   347            } elsif($dec =~ /<font>/)  {
   348                output($u,$cmt,$udec,&decompose("<font>", $dec));
   349            } elsif($dec =~ /<square>/)  {
   350               # ignore <square>
   351               # warning($_);
   352            } elsif($dec =~ /<compat>/)  {
   353                output($u,$cmt,$udec,&decompose("<compat>", $dec));
   354            } elsif($dec =~ /<isolated>/)  {
   355               # ignore <isolated>
   356               # warning($_);
   357            } elsif($dec =~ /<medial>/)  {
   358               # ignore <medial>
   359               # warning($_);
   360            } elsif($dec =~ /<final>/)  {
   361               # ignore <final>
   362               # warning($_);
   363            } elsif($dec =~ /<initial>/)  {
   364               # ignore <initial>
   365               # warning($_);
   366            } elsif($dec =~ /<noBreak>/)  {
   367              if($dec eq "<noBreak> 0020")
   368              {
   369                output($u,$cmt,$udec,"\\u0020");
   370              } else {
   371               # ignore 
   372               # warning($_);
   373              }
   374            } else {
   375              warning($_);
   376            }
   377        } else {
   378          # decomposition without format code
   379          if($cmt =~ /LATIN/) {
   380            $dec = foldcombining($dec);
   381               output($u,$cmt,$udec,&decompose("", $dec));
   382          } elsif($cmt =~ /CYRILLIC/) {
   383               # ignore 
   384               # warning($_);
   385          } elsif($cmt =~ /GREEK/) {
   386               # ignore 
   387               # warning($_);
   388          } elsif($cmt =~ /ARABIC/) {
   389               # ignore 
   390               # warning($_);
   391          } elsif($cmt =~ /CJK/) {
   392               # ignore 
   393               # warning($_);
   394          } elsif($cmt =~ /HEBREW/) {
   395               # ignore 
   396               # warning($_);
   397          } elsif($cmt =~ /DEVANAGARI/) {
   398               # ignore 
   399               # warning($_);
   400          } elsif($cmt =~ /BENGALI/) {
   401               # ignore 
   402               # warning($_);
   403          } elsif($cmt =~ /GURMUKHI/) {
   404               # ignore 
   405               # warning($_);
   406          } elsif($cmt =~ /ORIYA/) {
   407               # ignore 
   408               # warning($_);
   409          } elsif($cmt =~ /TAMIL/) {
   410               # ignore 
   411               # warning($_);
   412          } elsif($cmt =~ /TELUGU/) {
   413               # ignore 
   414               # warning($_);
   415          } elsif($cmt =~ /KANNADA/) {
   416               # ignore 
   417               # warning($_);
   418          } elsif($cmt =~ /MALAYALAM/) {
   419               # ignore 
   420               # warning($_);
   421          } elsif($cmt =~ /SINHALA/) {
   422               # ignore 
   423               # warning($_);
   424          } elsif($cmt =~ /TIBETAN/) {
   425               # ignore 
   426               # warning($_);
   427          } elsif($cmt =~ /MYANMAR/) {
   428               # ignore 
   429               # warning($_);
   430          } elsif($cmt =~ /KATAKANA/) {
   431               # ignore 
   432               # warning($_);
   433          } elsif($cmt =~ /HIRAGANA/) {
   434               # ignore 
   435               # warning($_);
   436          } else {
   437               # ignore 
   438               # warning($_);
   439          }
   440        }
   441      } else {
   442        # do not have decomposition
   443        if ($d1 ne "") 
   444        {
   445          # are numeric characters
   446          output($u,$cmt,$udec,$d1);
   447        } elsif ($d2 ne "") {
   448          if($cmt =~ /CIRCLED/) {
   449            # circled
   450            output($u,$cmt,$udec,"(".$d2.")");
   451          } else {
   452            # others, use [ ]
   453            output($u,$cmt,$udec,"[".$d2."]");
   454          }
   455        } elsif ($d3 ne "") {
   456          if($cmt =~ /CIRCLED/) {
   457            # circled
   458            output($u,$cmt,$udec,"(".$d3.")");
   459          } else {
   460            # others, use [ ]
   461            output($u,$cmt,$udec,"[".$d3."]");
   462          }
   463        } else {
   464          # not numeric characters
   466        } # end of no decomposition
   467      } # end of have/not decomposition
   468    }
   469 }
   470 ######################################################################
   471 #
   472 # Close files
   473 #
   474 ######################################################################
   475 close(UNIDATA);
   476 close(OUT);

mercurial