The Tor Browser: intl/unicharutil/tools/gentransliterate.pl@fc2d59ddac77

     1 #!/usr/bin/perl

2 #

     3 # This Source Code Form is subject to the terms of the Mozilla Public

     4 # License, v. 2.0. If a copy of the MPL was not distributed with this

     5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

     7 $header = <<END_OF_HEADER;

     8 # This Source Code Form is subject to the terms of the Mozilla Public

     9 # License, v. 2.0. If a copy of the MPL was not distributed with this

    10 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

    12 #

    13 # THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl

    14 # PLEASE DO NOT MODIFY THIS FILE BY HAND

    15 #

    16 entity.list.name=transliterate

    17 entity.169=(c)

    18 #

    19 #

    20 # Here are the windows-1252 characters from the range 0x80 - 0x9F

    21 #

    22 END_OF_HEADER

    24 $handcoded = <<END_OF_HANDCODED;

    25 # EURO SIGN

    26 entity.8364=EUR

    27 # SINGLE LOW-9 QUOTATION MARK

    28 entity.8218=,

    29 # LATIN SMALL LETTER F WITH HOOK

    30 entity.402=f

    31 # DOUBLE LOW-9 QUOTATION MARK

    32 entity.8222="

    33 # DAGGER

    34 entity.8224=+

    35 # DOUBLE DAGGER

    36 entity.8225=++

    37 # MODIFIER LETTER CIRCUMFLEX ACCENT

    38 entity.710=^

    39 # PER MILLE SIGN

    40 entity.8240=0/00

    41 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK

    42 entity.8249=<

    43 # LATIN CAPITAL LIGATURE OE

    44 entity.338=OE

    45 # LEFT SINGLE QUOTATION MARK

    46 entity.8216='

    47 # RIGHT SINGLE QUOTATION MARK

    48 entity.8217='

    49 # LEFT DOUBLE QUOTATION MARK

    50 entity.8220="

    51 # RIGHT DOUBLE QUOTATION MARK

    52 entity.8221="

    53 # BULLET

    54 entity.8226=.

    55 # EN DASH

    56 entity.8211=--

    57 # EM DASH

    58 entity.8212=---

    59 # SMALL TILDE

    60 entity.732=~

    61 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK

    62 entity.8250=>

    63 # LATIN SMALL LIGATURE OE

    64 entity.339=oe

    65 # U+2000 EN QUAD

    66 entity.8192=\\u0020

    67 # U+2001 EM QUAD

    68 entity.8193=\\u0020

    69 # U+2010 HYPHEN

    70 entity.8208=-

    71 # U+2011 NON-BREAKING HYPHEN

    72 entity.8209=-

    73 # U+2012 FIGURE DASH

    74 entity.8210=-

    75 # U+2015 HORIZONTAL BAR

    76 entity.8213=--

    77 # U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma)

    78 entity.8203=

    79 # U+2061, ApplyFunction, character showing function application in presentation tagging

    80 entity.8289=

    81 # U+2062, InvisibleTimes, marks multiplication when it is understood without a mark

    82 entity.8290=

    83 # U+2146, DifferentialD, d for use in differentials, e.g., within integrals

    84 entity.8518=d

    85 # U+2212, MINUS SIGN, official Unicode minus sign

    86 entity.8722=-

    87 # Hebrew punctuation

    88 # U+05BE HEBREW PUNCTUATION MAQAF

    89 entity.1470=-

    90 # U+05C0 HEBREW PUNCTUATION PASEQ

    91 entity.1472=|

    92 # U+05C3 HEBREW PUNCTUATION SOF PASUQ

    93 entity.1475=:

    94 # U+05F3 HEBREW PUNCTUATION GERESH

    95 entity.1523='

    96 # U+05F4 HEBREW PUNCTUATION GERSHAYIM

    97 entity.1524="

    98 ##

    99 ## End of hand coded section

   100 ## Below are generated from the unicode character database

   101 ##

   102 END_OF_HANDCODED

   104 @table = ();

   105 sub FromLatinComment

   106 {

   107   my ($cmt) = (@_);

   108   $char = "";

   109   if($cmt =~ /PRECEDED BY APOSTROPHE/) {

   110       $char = "\'";

   111   }

   112   if($cmt =~ /CAPITAL LETTER ([A-Z]*)/) {

   113       $char = $char . $1;

   114   }

   115   if($cmt =~ /SMALL LETTER ([A-Z]*)/) {

   116       $char = $char . lc($1);

   117   }

   118   @f = split(/ / , $cmt);

   119   while($item = shift @f) {

   120      if($item eq "DOT") {

   121        $char .= ".";

   122      } elsif ($item eq "DIAERESIS") {

   123        $char .= "\"";

   124      } elsif ($item eq "BREVE") {

   125        $char .= "(";

   126      } elsif ($item eq "ACUTE") {

   127        $char .= "\'";

   128      } elsif ($item eq "GRAVE") {

   129        $char .= "`";

   130      } elsif ($item eq "TILDE") {

   131        $char .= "~";

   132      } elsif ($item eq "CARON") {

   133        $char .= "(";

   134      } elsif ($item eq "HOOK") {

   135        $char .= "?";

   136      } elsif ($item eq "CEDILLA") {

   137        $char .= ",";

   138      } elsif ($item eq "MACRON") {

   139        $char .= "-";

   140      } elsif ($item eq "CIRCUMFLEX") {

   141        $char .= "^";

   142      } elsif ($item eq "RING") {

   143        $char .= "*";

   144      } elsif ($item eq "OGONEK") {

   145        $char .= ";";

   146      } elsif ($item eq "LINE") {

   147        $char .= "_";

   148      } elsif ($item eq "COMMA") {

   149        $char .= ",";

   150      } elsif ($item eq "STROKE") {

   151        $char .= "/";

   152      } elsif ($item eq "HORN") {

   153        $char .= "+";

   154      } elsif ($item =~ /^(LATIN|CAPITAL|SMALL|LETTER|WITH|ABOVE|BELOW|INVERTED|MIDDLE|AND|BY|APOSTROPHE|[A-Z])$/) {

   155        # ignore

   156      } else {

   157        #print "AAAA $item\n";

   158      }

   159   }

   161   return $char;

   162 }

   163 sub warning

   164 {

   165   my ($warning) = (@_);

   166   print "WARNING: $warning \n";

   167 }

   168 sub doutput

   169 {

   170   my ($u, $cmt, $udec, $str) = (@_);

   171   # don't print out comments - for debugging purposes only

   172   # print "# U+$u $cmt\n";

   173   print "entity.$udec=$str\n";

   174 }

   175 sub output

   176 {

   177   my ($u, $cmt, $udec, $str) = (@_);

   178   if(decomposeIntoNonASCII($str)) {

   179     if(($cmt =~ "LATIN")  && ($cmt =~ "LETTER") && !($cmt =~ "LONG")) {

   180        $str = FromLatinComment($cmt);

   181        output($u,$cmt,$udec,$str);

   182     }

   183   } else {

   184     # don't print out comments - for debugging purposes only

   185     # print OUT "# U+$u $cmt\n";

   186     print OUT "entity.$udec=$str\n";

   187   }

   188 }

   190 sub decomposeIntoNonASCII

   191 {

   192   my ($dec) = (@_);

   193   return $dec =~ /\\u([1-9A-F][0-9A-F][0-9A-F]|[0-9A-F][1-9A-F][0-9A-F]|00[8-9A-F])[0-9A-F]/;

   194 }

   196 sub foldcombining

   197 {

   198   my ($dec) = (@_);

   199   $grave = "0060";

   200   $acute = "0027";

   201   $hat = "005E";

   202   $hat = "005E";

   203   $tilde = "007E";

   204   $overscore = "002D"; ## should be 00AF but we can only handle ASCII now

   205   $umlaut = "0022"; ## should be 00A8 but we can only handle ASCII now

   206   $doubleacute = "0022";

   207   $dot = "002E";

   208   $doublegrave = "0060 0060";

   211   $dec =~ s/00A8/$umlaut/eg;

   212   $dec =~ s/00AF/$overscore/eg;

   213  # $dec =~ s/00B0//eg;

   214   $dec =~ s/00B4/$acute/eg;

   215   $dec =~ s/00B7/$dot/eg;

   216  # $dec =~ s/00B8//eg;

   217   $dec =~ s/0300/$grave/eg;

   218   $dec =~ s/0301/$acute/eg;

   219   $dec =~ s/0302/$hat/eg;

   220   $dec =~ s/0303/$tilde/eg;

   221   $dec =~ s/0304/$overscore/eg;

   222   $dec =~ s/0305/$overscore/eg;

   223  #$dec =~ s/0306/?/eg;

   224   $dec =~ s/0307/$dot/eg;

   225   $dec =~ s/0308/$umlaut/eg;

   226  #$dec =~ s/0309/?/eg;

   227  #$dec =~ s/030A/?/eg;

   228   $dec =~ s/030B/$doubleacute/eg;

   229  #$dec =~ s/030C/?/eg;

   230   $dec =~ s/030D/$acute/eg;

   231   $dec =~ s/030E/$doubleacute/eg;

   232   $dec =~ s/030F/$doublegrave/eg;

   234  # $dec =~ s/03[0-9A-F][0-9A-F]//eg; ## drop others

   235   return $dec;

   236 }

   237 sub rdecompose

   238 {

   239   my ($dec) = (@_);

   240   if(exists $table{$dec}) {

   241     $t = $table{$dec};

   242     $t =~ s/<[a-zA-Z]*>//eg;

   243     $t = foldcombining($t);

   244     return rdecompose( $table{$t});

   245   }

   246   return $dec;

   247 }

   248 sub decompose

   249 {

   250   my ($removeprefix, $dec) = (@_);

   251   $removeprefix .= " ";

   253   $dec =~ s/$removeprefix//eg;

   254   if($dec eq "0020") {

   255    $dec = "\\u0020";

   256   } elsif($dec eq "005C") {

   257    $dec = "\\u005C";

   258   } else {

   259    $k = "\/";

   260    $dec =~ s/2044/$k/eg;

   261    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;

   262    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;

   263    $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;

   264    $dec =~ s/ //eg;

   265   }

   266   return $dec;

   267 }

   269 ######################################################################

   270 #

   271 # Open the unicode database file

   272 #

   273 ######################################################################

   274 open ( UNICODATA , "< UnicodeData-Latest.txt")

   275    || die "cannot find UnicodeData-Latest.txt";

   277 open ( UNICODATA2 , "< UnicodeData-Latest.txt")

   278    || die "cannot find UnicodeData-Latest.txt";

   279 ######################################################################

   280 #

   281 # Open the output file

   282 #

   283 ######################################################################

   284 open ( OUT , "> ../tables/transliterate.properties")

   285   || die "cannot open output ../tables/transliterate.properties file";

   287 print OUT $header;

   289 # remove comments from $handcoded

   290 $handcoded =~ s/^#[^#].*\n//mg;

   291 print OUT $handcoded;

   293 ######################################################################

   294 #

   295 # Process the file line by line

   296 #

   297 ######################################################################

   298 while(<UNICODATA2>) {

   299    chop;

   300    @f = split(/;/ , $_);

   301    $udec = hex($u);

   302    if(($udec > 256 ) && ($f[5] ne "")) {

   303      $table{$f[0]}=$f[5];

   304    }

   305 }

   306 while(<UNICODATA>) {

   307    chop;

   308    ######################################################################

   309    #

   310    # Get value from fields

   311    #

   312    ######################################################################

   313    @f = split(/;/ , $_);

   314    $u = $f[0];    # The unicode value

   315    $cmt = $f[1];  # The comment

   316    $dec = $f[5];  # The decomposed value

   317    $d1 = $f[6];

   318    $d2 = $f[7];

   319    $d3 = $f[8];

   320    $udec = hex($u);

   322    if($udec > 128)

   323    {

   324      # not ASCII

   325      if($dec ne "")

   326      {

   327        # have decomposition

   328        if($dec =~ /</)  {

   329            # formated decomposition

   330            if($dec =~ /<wide>/)  {

   331               output($u,$cmt,$udec,&decompose("<wide>", $dec));

   332            } elsif($dec =~ /<narrow>/)  {

   333               # ignore non ASCII decomposition

   334               # warning($_);

   335            } elsif($dec =~ /<circle>/)  {

   336               output($u,$cmt,$udec,&decompose("<circle>", "(".$dec.")"));

   337            } elsif($dec =~ /<fraction>/)  {

   338               output($u,$cmt,$udec,&decompose("<fraction>", $dec));

   339            } elsif($dec =~ /<small>/)  {

   340               output($u,$cmt,$udec,&decompose("<small>", $dec));

   341            } elsif($dec =~ /<vertical>/)  {

   342               # warning($_);

   343            } elsif($dec =~ /<super>/)  {

   344               output($u,$cmt,$udec,"^(".&decompose("<super>", $dec).")");

   345            } elsif($dec =~ /<sub>/)  {

   346               output($u,$cmt,$udec,"v(".&decompose("<sub>", $dec).")");

   347            } elsif($dec =~ /<font>/)  {

   348                output($u,$cmt,$udec,&decompose("<font>", $dec));

   349            } elsif($dec =~ /<square>/)  {

   350               # ignore <square>

   351               # warning($_);

   352            } elsif($dec =~ /<compat>/)  {

   353                output($u,$cmt,$udec,&decompose("<compat>", $dec));

   354            } elsif($dec =~ /<isolated>/)  {

   355               # ignore <isolated>

   356               # warning($_);

   357            } elsif($dec =~ /<medial>/)  {

   358               # ignore <medial>

   359               # warning($_);

   360            } elsif($dec =~ /<final>/)  {

   361               # ignore <final>

   362               # warning($_);

   363            } elsif($dec =~ /<initial>/)  {

   364               # ignore <initial>

   365               # warning($_);

   366            } elsif($dec =~ /<noBreak>/)  {

   367              if($dec eq "<noBreak> 0020")

   368              {

   369                output($u,$cmt,$udec,"\\u0020");

   370              } else {

   371               # ignore

   372               # warning($_);

   373              }

   374            } else {

   375              warning($_);

   376            }

   377        } else {

   378          # decomposition without format code

   379          if($cmt =~ /LATIN/) {

   380            $dec = foldcombining($dec);

   381               output($u,$cmt,$udec,&decompose("", $dec));

   382          } elsif($cmt =~ /CYRILLIC/) {

   383               # ignore

   384               # warning($_);

   385          } elsif($cmt =~ /GREEK/) {

   386               # ignore

   387               # warning($_);

   388          } elsif($cmt =~ /ARABIC/) {

   389               # ignore

   390               # warning($_);

   391          } elsif($cmt =~ /CJK/) {

   392               # ignore

   393               # warning($_);

   394          } elsif($cmt =~ /HEBREW/) {

   395               # ignore

   396               # warning($_);

   397          } elsif($cmt =~ /DEVANAGARI/) {

   398               # ignore

   399               # warning($_);

   400          } elsif($cmt =~ /BENGALI/) {

   401               # ignore

   402               # warning($_);

   403          } elsif($cmt =~ /GURMUKHI/) {

   404               # ignore

   405               # warning($_);

   406          } elsif($cmt =~ /ORIYA/) {

   407               # ignore

   408               # warning($_);

   409          } elsif($cmt =~ /TAMIL/) {

   410               # ignore

   411               # warning($_);

   412          } elsif($cmt =~ /TELUGU/) {

   413               # ignore

   414               # warning($_);

   415          } elsif($cmt =~ /KANNADA/) {

   416               # ignore

   417               # warning($_);

   418          } elsif($cmt =~ /MALAYALAM/) {

   419               # ignore

   420               # warning($_);

   421          } elsif($cmt =~ /SINHALA/) {

   422               # ignore

   423               # warning($_);

   424          } elsif($cmt =~ /TIBETAN/) {

   425               # ignore

   426               # warning($_);

   427          } elsif($cmt =~ /MYANMAR/) {

   428               # ignore

   429               # warning($_);

   430          } elsif($cmt =~ /KATAKANA/) {

   431               # ignore

   432               # warning($_);

   433          } elsif($cmt =~ /HIRAGANA/) {

   434               # ignore

   435               # warning($_);

   436          } else {

   437               # ignore

   438               # warning($_);

   439          }

   440        }

   441      } else {

   442        # do not have decomposition

   443        if ($d1 ne "")

   444        {

   445          # are numeric characters

   446          output($u,$cmt,$udec,$d1);

   447        } elsif ($d2 ne "") {

   448          if($cmt =~ /CIRCLED/) {

   449            # circled

   450            output($u,$cmt,$udec,"(".$d2.")");

   451          } else {

   452            # others, use [ ]

   453            output($u,$cmt,$udec,"[".$d2."]");

   454          }

   455        } elsif ($d3 ne "") {

   456          if($cmt =~ /CIRCLED/) {

   457            # circled

   458            output($u,$cmt,$udec,"(".$d3.")");

   459          } else {

   460            # others, use [ ]

   461            output($u,$cmt,$udec,"[".$d3."]");

   462          }

   463        } else {

   464          # not numeric characters

   466        } # end of no decomposition

   467      } # end of have/not decomposition

   468    }

   469 }

   470 ######################################################################

   471 #

   472 # Close files

   473 #

   474 ######################################################################

   475 close(UNIDATA);

   476 close(OUT);

The Tor Browser / file revision

intl/unicharutil/tools/gentransliterate.pl@fc2d59ddac77

intl/unicharutil/tools/gentransliterate.pl