intl/uconv/tools/gengb18030tables.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/uconv/tools/gengb18030tables.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,228 @@
     1.4 +#!/usr/local/bin/perl
     1.5 +# -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
     1.6 +#
     1.7 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.8 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.9 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
    # Lookup tables shared by the routines in this script.
    #
    # They are initialised with "()" and not "{}": assigning an anonymous hash
    # reference to a hash creates a single bogus entry whose key is the
    # stringified reference ("HASH(0x...)"), and the print* routines would
    # then write that entry into the generated tables.
    %gb18030tounicode = ();      # GB18030 code -> Unicode, filled by readgb18030
    %unicodetogb18030 = ();      # Unicode -> GB18030 code, filled by readgb18030
    %unicodetocp936 = ();        # Unicode -> CP936 code, filled by readcp936/addeudc
    %cp936tounicode = ();        # CP936 code -> Unicode, filled by readcp936
    %tounicodecommon = ();       # code -> Unicode where both charsets agree
    %gb18030tounicodeuniq = ();  # GB18030-only codes of at most 4 hex digits
    %gb180304btounicode = ();    # GB18030-only codes longer than 4 hex digits
    %cp936tounicodeuniq = ();    # CP936-only codes

    %map = ();                   # not referenced by any routine in this script

    # Number of cells cp936tonum assigns to one lead byte: (0xff - 0x80) cells
    # for trail bytes from 0x80 up, plus (0x7f - 0x40) cells for 0x40..0x7E.
    $rowwidth = ((0xff - 0x80)+(0x7f - 0x40));
    # Convert a CP936 double-byte code into a linear cell number.
    #
    # Argument: the code as a string with a two-character prefix followed by
    #           four hex digits, e.g. "0x8140" (characters 2-3 are read as the
    #           lead byte, characters 4-5 as the trail byte).
    # Returns:  (lead - 0x81) * $rowwidth plus the trail-byte offset, where
    #           trail bytes 0x40..0x7E come first and trail bytes from 0x80 up
    #           follow immediately (0x7F gets no cell).
    #
    # Relies on the file-level $rowwidth.  The former empty prototype "()" was
    # dropped: it declared "no arguments" although the routine reads one, which
    # made a plain call such as cp936tonum("0x8140") a compile-time error.
    sub cp936tonum
    {
       my ($cp936) = @_;
       my $first  = hex(substr($cp936, 2, 2));
       my $second = hex(substr($cp936, 4, 2));
       my $jnum   = ($first - 0x81) * $rowwidth;

       if ($second >= 0x80) {
           # Upper trail range is placed right after the 0x40..0x7E cells.
           $jnum += $second - 0x80 + (0x7f - 0x40);
       }
       else {
           $jnum += $second - 0x40;
       }
       return $jnum;
    }
    # Add the end-user-defined-character (EUDC) mappings to %unicodetocp936.
    #
    # Consecutive Unicode values starting at U+E000 are assigned, in this
    # order, to the codes AAA1-AFFE, F8A1-FEFE and A140-A7A0 (the trail byte
    # 0x7F is skipped in the last area).  Every assignment is unconditional,
    # so an existing %unicodetocp936 entry for one of these Unicode values is
    # overwritten.
    #
    # Takes no arguments and returns nothing.  The Unicode counter is now a
    # lexical; previously "$u" was missing from the my() list, so calling this
    # routine clobbered the package global $u.
    sub addeudc
    {
      # Each area: first lead byte, last lead byte, then one or more
      # [first trail byte, last trail byte] ranges walked for every lead byte.
      my @areas = (
        [ 0xAA, 0xAF, [ 0xA1, 0xFE ] ],                  # AAA1-AFFE
        [ 0xF8, 0xFE, [ 0xA1, 0xFE ] ],                  # F8A1-FEFE
        [ 0xA1, 0xA7, [ 0x40, 0x7E ], [ 0x80, 0xA0 ] ],  # A140-A7A0 without 7F
      );

      my $u = 0xE000;
      foreach my $area (@areas) {
        my ($firstlead, $lastlead, @trails) = @$area;
        foreach my $h ($firstlead .. $lastlead) {
          foreach my $trail (@trails) {
            foreach my $l ($trail->[0] .. $trail->[1]) {
              my $us = sprintf("%04X", $u);
              my $hl = sprintf("%02X%02X", $h, $l);
              $unicodetocp936{$us} = $hl;
              $u++;
            }
          }
        }
      }
      return;
    }
    1.86 +
    # Load the CP936 mapping file "CP936.txt" from the current directory into
    # %cp936tounicode and %unicodetocp936.
    #
    # Lines starting with "#" are skipped.  Every other line is split on tabs
    # into a code field and a Unicode field; lines whose Unicode field does not
    # start with "0x" are ignored.  Up to four characters following the "0x"
    # of each field are used as hash key/value.
    #
    # For a Unicode value that occurs more than once, the first code seen is
    # kept in %unicodetocp936 and a warning is written to STDERR.
    #
    # Fixes over the previous version:
    #  * the "already mapped" test used numeric "==" against "", which is true
    #    for every code that does not start with a digit 1-9 (e.g. "A1A1"), so
    #    such duplicates silently replaced the first mapping;
    #  * the warning text was a bare expression and was never emitted;
    #  * chomp instead of chop, so a final line without a newline keeps its
    #    last character;
    #  * the file handle is lexical and is closed.
    #
    # Dies if the file cannot be opened.
    sub readcp936
    {
      open(my $fh, "<", "CP936.txt") or die "Cannot open CP936 file: $!";
      while (my $line = <$fh>) {
        next if $line =~ /^#/;
        chomp($line);

        my ($gb, $u) = split(/\t/, $line);
        next unless defined($u) && $u =~ /^0x/;

        my $u1  = substr($u, 2, 4);
        my $gb1 = substr($gb, 2, 4);
        $cp936tounicode{$gb1} = $u1;

        if (!defined($unicodetocp936{$u1}) || $unicodetocp936{$u1} eq "") {
          $unicodetocp936{$u1} = $gb1;
        }
        else {
          warn "WARNING: Unicode " . $u1 . " already map to CP936 " .
            $unicodetocp936{$u1} . " when we try to map to " . $gb1 . "\n";
        }
      }
      close($fh);
      return;
    }
   # Load the GB18030 mapping file "GB18030" from the current directory into
   # %gb18030tounicode and %unicodetogb18030.
   #
   # Only lines starting with an upper-case hex digit are used.  Each such
   # line is split on single whitespace characters; the first field is the
   # Unicode value and the second the GB18030 code.  Lines without a second
   # field are skipped.
   #
   # For a Unicode value that occurs more than once, the first code seen is
   # kept in %unicodetogb18030 and a warning is written to STDERR.
   #
   # Fixes over the previous version:
   #  * the "already mapped" test used numeric "==" against "", which is true
   #    for every code that does not start with a digit 1-9 (e.g. "A1E8"), so
   #    such duplicates silently replaced the first mapping;
   #  * the warning was a bare expression (never emitted) and its text was
   #    copied from readcp936: it named CP936 and used $u1/$gb1 and
   #    %unicodetocp936 instead of this routine's own values;
   #  * chomp instead of chop; lexical file handle that is closed.
   #
   # Dies if the file cannot be opened.
   sub readgb18030
   {
     open(my $fh, "<", "GB18030") or die "Cannot open GB18030 file: $!";
     while (my $line = <$fh>) {
       next unless $line =~ /^[0-9A-F]/;
       chomp($line);

       my ($u, $gb) = split(/\s/, $line);
       next unless defined($gb) && $gb ne "";

       $gb18030tounicode{$gb} = $u;
       if (!defined($unicodetogb18030{$u}) || $unicodetogb18030{$u} eq "") {
         $unicodetogb18030{$u} = $gb;
       }
       else {
         warn "WARNING: Unicode " . $u . " already map to GB18030 " .
           $unicodetogb18030{$u} . " when we try to map to " . $gb . "\n";
       }
     }
     close($fh);
     return;
   }
   # Distribute the mappings for U+0000..U+FFFF over the output tables.
   #
   # For every Unicode value:
   #  * if %unicodetogb18030 and %unicodetocp936 hold the same non-empty code,
   #    it goes into %tounicodecommon;
   #  * otherwise a GB18030 code longer than four characters (a four-byte
   #    code written as eight hex digits) goes into %gb180304btounicode, any
   #    other GB18030 code into %gb18030tounicodeuniq, and a CP936 code into
   #    %cp936tounicodeuniq.
   # Unicode values mapped in neither table are ignored.
   #
   # Fix: the length test used to be written "$code.length > 4".  In Perl
   # that concatenates $code with length($_) and compares the result
   # numerically with 4, so the outcome depended on whether the code happened
   # to start with a digit instead of on its length.
   #
   # Takes no arguments and returns nothing.
   sub splittable
   {
     for (my $i = 0; $i < 0x10000; $i++) {
       my $u  = sprintf "%04X", $i;
       my $gb = $unicodetogb18030{$u};
       my $cp = $unicodetocp936{$u};
       $gb = "" unless defined $gb;
       $cp = "" unless defined $cp;

       if ($gb eq $cp) {
         # Same code in both charsets (or unmapped in both).
         $tounicodecommon{$gb} = $u if $gb ne "";
         next;
       }

       if ($gb ne "") {
         if (length($gb) > 4) {
           $gb180304btounicode{$gb} = $u;
         }
         else {
           $gb18030tounicodeuniq{$gb} = $u;
         }
       }
       if ($cp ne "") {
         $cp936tounicodeuniq{$cp} = $u;
       }
     }
     return;
   }
   # Convert a four-byte GB18030 code into its linear index.
   #
   # Argument: the code as eight hex digits without prefix, e.g. "81308130".
   # Returns:  the index formatted as at least four upper-case hex digits.
   #
   # The bytes are treated as digits of a mixed-radix number: bytes 1 and 3
   # are offset by 0x81, bytes 2 and 4 by 0x30, and the index is
   #   (((b1 * 10) + b2) * 126 + b3) * 10 + b4
   # so "81308130" yields "0000".
   #
   # The former empty prototype "()" was dropped: it declared "no arguments"
   # although the routine reads one, which made a plain call such as
   # gb4bytestoidx($gb) a compile-time error.
   sub gb4bytestoidx
   {
     my ($gb) = @_;
     my $b1 = hex(substr($gb, 0, 2)) - 0x81;
     my $b2 = hex(substr($gb, 2, 2)) - 0x30;
     my $b3 = hex(substr($gb, 4, 2)) - 0x81;
     my $b4 = hex(substr($gb, 6, 2)) - 0x30;
     return sprintf("%04X", ((($b1 * 10) + $b2) * 126 + $b3) * 10 + $b4);
   }
   # Write %tounicodecommon to "gbkcommon.txt" in the current directory.
   #
   # One line per entry, sorted by code as strings:
   #   0x<code> TAB 0x<unicode> NEWLINE
   #
   # Uses a lexical handle with three-argument open, reports the system error
   # when the file cannot be created, and checks close so that a failed
   # buffered write is not silently lost.  Dies on either failure.
   sub printcommontable
   {
     open(my $out, ">", "gbkcommon.txt") or die "cannot open gbkcommon.txt: $!";
     foreach my $gb (sort(keys %tounicodecommon)) {
       print {$out} "0x" . $gb . "\t0x" . $tounicodecommon{$gb} . "\n";
     }
     close($out) or die "cannot close gbkcommon.txt: $!";
     return;
   }
   # Write %cp936tounicodeuniq to "cp936uniq.txt" in the current directory.
   #
   # One line per entry, sorted by code as strings:
   #   0x<code> TAB 0x<unicode> NEWLINE
   #
   # Uses a lexical handle with three-argument open, reports the system error
   # when the file cannot be created, and checks close so that a failed
   # buffered write is not silently lost.  Dies on either failure.
   sub printcp936table
   {
     open(my $out, ">", "cp936uniq.txt") or die "cannot open cp936uniq.txt: $!";
     foreach my $gb (sort(keys %cp936tounicodeuniq)) {
       print {$out} "0x" . $gb . "\t0x" . $cp936tounicodeuniq{$gb} . "\n";
     }
     close($out) or die "cannot close cp936uniq.txt: $!";
     return;
   }
   # Write %gb180304btounicode to "gb180304b.txt" in the current directory.
   #
   # Entries are sorted by code as strings; entries whose Unicode value is
   # "FFFF" are left out.  Each line has the form
   #   0x<index> TAB 0x<unicode> TAB # 0x<code> NEWLINE
   # where <index> is what gb4bytestoidx returns for the code.
   #
   # Uses a lexical handle with three-argument open, reports the system error
   # when the file cannot be created, and checks close so that a failed
   # buffered write is not silently lost.  Dies on either failure.
   sub printgb180304btable
   {
     open(my $out, ">", "gb180304b.txt") or die "cannot open gb180304b.txt: $!";
     foreach my $gb (sort(keys %gb180304btounicode)) {
       next if $gb180304btounicode{$gb} eq "FFFF";
       print {$out} "0x" . &gb4bytestoidx($gb) . "\t0x" . $gb180304btounicode{$gb} . "\t# 0x" . $gb . "\n";
     }
     close($out) or die "cannot close gb180304b.txt: $!";
     return;
   }
   # Write %gb18030tounicodeuniq to "gb18030uniq.txt" in the current directory.
   #
   # One line per entry, sorted by code as strings:
   #   0x<code> TAB 0x<unicode> NEWLINE
   #
   # Uses a lexical handle with three-argument open, reports the system error
   # when the file cannot be created, and checks close so that a failed
   # buffered write is not silently lost.  Dies on either failure.
   sub printgb18030table
   {
     open(my $out, ">", "gb18030uniq.txt") or die "cannot open gb18030uniq.txt: $!";
     foreach my $gb (sort(keys %gb18030tounicodeuniq)) {
       print {$out} "0x" . $gb . "\t0x" . $gb18030tounicodeuniq{$gb} . "\n";
     }
     close($out) or die "cannot close gb18030uniq.txt: $!";
     return;
   }
   1.198 +
   # Run the external generators that turn the text tables written by the
   # print* routines into the final .uf/.ut files and cp936map.h.
   #
   # Each command line is echoed to STDOUT and then handed to system() as a
   # single string, so the "<" and ">" redirections are handled by the shell.
   # The commands run in the listed order.
   #
   # NOTE(review): "umaptable" and "cp936tocdx.pl" are not part of this file;
   # presumably they must be reachable from the current directory / PATH.
   #
   # A command that cannot be started or exits non-zero used to go unnoticed
   # (leaving an empty or truncated output file behind).  It is now reported
   # on STDERR; the remaining commands are still run, as before.
   #
   # Takes no arguments and returns nothing.
   sub genufut
   {
     my @commands = (
       "umaptable -uf < gb18030uniq.txt > gb18030uniq2b.uf",
       "umaptable -ut < gb18030uniq.txt > gb18030uniq2b.ut",
       "umaptable -uf < cp936uniq.txt > gbkuniq2b.uf",
       "umaptable -ut < cp936uniq.txt > gbkuniq2b.ut",
       "umaptable -uf < gb180304b.txt > gb180304bytes.uf",
       "umaptable -ut < gb180304b.txt > gb180304bytes.ut",
       "perl cp936tocdx.pl > cp936map.h",
     );

     foreach my $command (@commands) {
       print($command . "\n");
       if (system($command) != 0) {
         # $? holds the wait status; -1 means the command could not be run.
         warn "WARNING: command failed (status " . $? . "): " . $command . "\n";
       }
     }
     return;
   }
   1.222 +
   # Driver.  The order matters: the two mapping files are loaded first, then
   # the EUDC entries are added on top of the CP936 data, the combined data
   # is split into the common/unique tables, the tables are written out and
   # finally the external generators are run on the written files.

   # 1. Load the source mappings.
   readgb18030();
   readcp936();
   addeudc();

   # 2. Partition them.
   splittable();

   # 3. Write the intermediate text tables.
   printcommontable();
   printgb180304btable();
   printgb18030table();
   printcp936table();

   # 4. Produce the final files from the text tables.
   genufut();

mercurial