intl/uconv/tools/gengb18030tables.pl

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 #!/usr/local/bin/perl
     2 # -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
     3 #
     4 # This Source Code Form is subject to the terms of the Mozilla Public
     5 # License, v. 2.0. If a copy of the MPL was not distributed with this
     6 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
     7 %gb18030tounicode = {};
     8 %unicodetogb18030 = {};
     9 %unicodetocp936 = {};
    10 %cp936tounicode = {};
    11 %tounicodecommon = {};
    12 %gb18030tounicodeuniq = {};
    13 %gb180304btounicode = {};
    14 %cp936tounicodeuniq = {};
    16 %map = {};
    17 $rowwidth = ((0xff - 0x80)+(0x7f - 0x40));
    18 sub cp936tonum()
    19 {
    20    my($cp936) = (@_);
    21    my($first,$second,$jnum);
    22    $first = hex(substr($cp936,2,2));
    23    $second = hex(substr($cp936,4,2));
    24    $jnum = ($first - 0x81 ) * $rowwidth;
    25    if($second >= 0x80)
    26    {
    27        $jnum += $second - 0x80 + (0x7f-0x40);
    28    }
    29    else
    30    {
    31        $jnum += $second - 0x40;
    32    }
    33    return $jnum;
    34 }
    35 sub addeudc()
    36 {
    37   my($l,$h,$hl,$us);
    39   $u = 0xE000;
    40   $us = sprintf "%04X", $u;
    41   # For AAA1-AFFE
    42   for($h=0xAA; $h <=0xAF;$h++)
    43   {
    44     for($l=0xA1; $l <=0xFE;$l++,$u++)
    45     {
    46         $us = sprintf "%04X", $u;
    47         $hl = sprintf "%02X%02X", $h, $l;
    48         $unicodetocp936{$us} = $hl;
    49     }
    50   }
    52   # For F8A1-FEFE
    53   $us = sprintf "%04X", $u;
    54   for($h=0xF8; $h <=0xFE;$h++)
    55   {
    56     for($l=0xA1; $l <=0xFE;$l++,$u++)
    57     {
    58         $us = sprintf "%04X", $u;
    59         $hl = sprintf "%02X%02X", $h, $l;
    60         $unicodetocp936{$us} = $hl;
    61     }
    62   }
    64   # For A140-A7A0
    65   $us = sprintf "%04X", $u;
    66   for($h=0xA1; $h <=0xA7;$h++)
    67   {
    68     for($l=0x40; $l <=0x7E;$l++,$u++)
    69     {
    70         $us = sprintf "%04X", $u;
    71         $hl = sprintf "%02X%02X", $h, $l;
    72         $unicodetocp936{$us} = $hl;
    73     }
    74     # We need to skip 7F
    75     for($l=0x80; $l <=0xA0;$l++,$u++)
    76     {
    77         $us = sprintf "%04X", $u;
    78         $hl = sprintf "%02X%02X", $h, $l;
    79         $unicodetocp936{$us} = $hl;
    80     }
    81   }
    82 }
    84 sub readcp936()
    85 {
    86   open(CP936, "<CP936.txt") || die "Cannot open CP936 file";
    87   while(<CP936>)
    88   {
    89     if(! /^#/) {
    90       chop();
    91       ($gb, $u) = split(/\t/, $_);
    92       if($u =~ /^0x/) {
    93         $u1 = substr($u, 2, 4);
    94         $gb1 = substr($gb, 2, 4);
    95         $cp936tounicode{$gb1} = $u1;
    96         if($unicodetocp936{$u1} == "") {
    97           $unicodetocp936{$u1} = $gb1;
    98         } else {
    99           "WARNING: Unicode " . $u1 . " already map to CP936 " . 
   100             $unicodetocp936{$u1} . " when we try to map to " . $gb1 . "\n";
   101         }
   103       }
   104     }
   105   }
   106 }
   107 sub readgb18030()
   108 {
   109   open(GB18030, "<GB18030") || die "Cannot open GB18030 file";
   110   while(<GB18030>)
   111   {
   112     if(/^[0-9A-F]/) {
   113       chop();
   114       ($u, $gb) = split(/\s/, $_);
   115       $gb18030tounicode{$gb} = $u;
   116         if( $unicodetogb18030{$u} == "" ) {
   117           $unicodetogb18030{$u} = $gb;
   118         } else {
   119           "WARNING: Unicode " . $u1 . " already map to CP936 " . 
   120             $unicodetocp936{$u1} . " when we try to map to " . $gb1 . "\n";
   121         }
   122     }
   123   }
   124 }
   125 sub splittable()
   126 {
   127   my($i, $u);
   128   for($i = 0; $i < 0x10000; $i++) {
   129      $u = sprintf "%04X", $i;
   130      if($unicodetogb18030{$u} eq $unicodetocp936{$u}) {
   131         if($unicodetogb18030{$u} ne "") {
   132           $tounicodecommon{$unicodetogb18030{$u}} = $u;
   133         } else {
   134 #          print $u . "|" . $unicodetogb18030{$u} . "|" . $unicodetocp936{$u} . "\n";
   135         }
   136      } else {
   137         if($unicodetogb18030{$u} ne "" ) {
   138            if($unicodetogb18030{$u}.length > 4) {
   139              $gb180304btounicode{$unicodetogb18030{$u}} = $u;
   140            } else {
   141              $gb18030tounicodeuniq{$unicodetogb18030{$u}} = $u;
   142            }
   143         } 
   144         if($unicodetocp936{$u} ne "" ) {
   145            $cp936tounicodeuniq{$unicodetocp936{$u}} = $u;
   146         }
   147      }
   148   }
   149 }
   150 sub gb4bytestoidx()
   151 {
   152   my($gb) = @_;
   153   my($b1,$b2, $b3, $b4,$idx);
   154   $b1 = hex(substr($gb, 0, 2)) - 0x81;
   155   $b2 = hex(substr($gb, 2, 2)) - 0x30;
   156   $b3 = hex(substr($gb, 4, 2)) - 0x81;
   157   $b4 = hex(substr($gb, 6, 2)) - 0x30;
   158   $idx = sprintf "%04X" , ((($b1 * 10) + $b2 ) * 126 + $b3) * 10 + $b4;
   159   return $idx;
   160 }
   161 sub printcommontable()
   162 {
   163   open ( GBKCOMMON, ">gbkcommon.txt" ) || die "cannot open gbkcommon.txt";
   164   foreach $gb (sort(keys %tounicodecommon)) {
   165       print GBKCOMMON "0x" . $gb . "\t0x" . $tounicodecommon{$gb} . "\n";
   166   }
   167   close GBKCOMMON;
   168 }
   169 sub printcp936table()
   170 {
   171   open ( CP936UNIQ, ">cp936uniq.txt" ) || die "cannot open cp936uniq.txt";
   172   foreach $gb (sort(keys %cp936tounicodeuniq)) {
   173       print CP936UNIQ "0x" . $gb . "\t0x" . $cp936tounicodeuniq{$gb} . "\n";
   174   }
   175   close CP936UNIQ;
   176 }
   177 sub printgb180304btable()
   178 {
   179   open ( GB180304B, ">gb180304b.txt" ) || die "cannot open gb180304b.txt";
   180   foreach $gb (sort(keys %gb180304btounicode)) {
   181       if($gb180304btounicode{$gb} ne "FFFF" ) {
   182         print GB180304B "0x" . &gb4bytestoidx($gb) . "\t0x" . $gb180304btounicode{$gb} . "\t# 0x" . $gb . "\n";
   183       }
   184   }
   185   close GB180304B;
   186 }
   187 sub printgb18030table()
   188 {
   189   open ( GB18030UNIQ, ">gb18030uniq.txt" ) || die "cannot open gb18030uniq.txt";
   190   foreach $gb (sort(keys %gb18030tounicodeuniq)) {
   191       print GB18030UNIQ "0x" . $gb . "\t0x" . $gb18030tounicodeuniq{$gb} . "\n";
   192   }
   193   close GB18030UNIQ;
   194 }
   196 sub genufut()
   197 {
   198  print ( "umaptable -uf < gb18030uniq.txt > gb18030uniq2b.uf\n");
   199  system( "umaptable -uf < gb18030uniq.txt > gb18030uniq2b.uf");
   201  print ( "umaptable -ut < gb18030uniq.txt > gb18030uniq2b.ut\n");
   202  system( "umaptable -ut < gb18030uniq.txt > gb18030uniq2b.ut");
   204  print ( "umaptable -uf < cp936uniq.txt > gbkuniq2b.uf\n") ;
   205  system( "umaptable -uf < cp936uniq.txt > gbkuniq2b.uf") ;
   207  print ( "umaptable -ut < cp936uniq.txt > gbkuniq2b.ut\n") ;
   208  system( "umaptable -ut < cp936uniq.txt > gbkuniq2b.ut") ;
   210  print ( "umaptable -uf < gb180304b.txt > gb180304bytes.uf\n")  ;
   211  system( "umaptable -uf < gb180304b.txt > gb180304bytes.uf")  ;
   213  print ( "umaptable -ut < gb180304b.txt > gb180304bytes.ut\n")  ;
   214  system( "umaptable -ut < gb180304b.txt > gb180304bytes.ut")  ;
   216  print ( "perl cp936tocdx.pl > cp936map.h\n");
   217  system( "perl cp936tocdx.pl > cp936map.h");
   218 }
   220 &readgb18030();
   221 &readcp936();
   222 &addeudc();
   223 &splittable();
   224 &printcommontable();
   225 &printgb180304btable();
   226 &printgb18030table();
   227 &printcp936table();
   228 &genufut();

mercurial