#!/usr/local/bin/perl
# -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Reads the mapping files "GB18030" and "CP936.txt" from the current
# directory, adds the EUDC (user-defined) ranges to the CP936 mapping,
# splits the Unicode BMP into "common" and per-charset "unique" tables,
# writes them to gbkcommon.txt, gb180304b.txt, gb18030uniq.txt and
# cp936uniq.txt, and finally runs umaptable / cp936tocdx.pl on the results.

use strict;
use warnings;

# Script-wide mapping tables.  Keys and values are upper-case hex strings
# without a "0x" prefix.  They are declared with "our" so they stay package
# globals, exactly as they were before strictures were enabled.
# NOTE: they must be initialised with an empty list; "= {}" would store a
# stringified hash reference as a bogus key that ends up in the output files.
our %gb18030tounicode = ();      # GB18030 code  -> Unicode
our %unicodetogb18030 = ();      # Unicode       -> GB18030 code (first seen)
our %unicodetocp936 = ();        # Unicode       -> CP936 code
our %cp936tounicode = ();        # CP936 code    -> Unicode
our %tounicodecommon = ();       # code shared by both charsets -> Unicode
our %gb18030tounicodeuniq = ();  # GB18030-only code (<= 4 hex digits) -> Unicode
our %gb180304btounicode = ();    # GB18030-only code (> 4 hex digits)  -> Unicode
our %cp936tounicodeuniq = ();    # CP936-only code -> Unicode

our %map = ();                   # not used by this script; kept for compatibility

# Number of trail-byte slots per lead byte: 0x80..0xFE plus 0x40..0x7E.
our $rowwidth = ((0xff - 0x80)+(0x7f - 0x40));

# cp936tonum($code)
#   Converts a two-byte code given as a string whose characters 2-3 and 4-5
#   are the hex lead and trail byte (e.g. "0x8140") into a linear index:
#   (lead - 0x81) * $rowwidth + trail offset, where trail bytes 0x40..0x7E
#   come first and trail bytes >= 0x80 follow them.
#   Returns the index as a number.
sub cp936tonum
{
  my ($cp936) = @_;
  my $first  = hex(substr($cp936, 2, 2));
  my $second = hex(substr($cp936, 4, 2));
  my $jnum   = ($first - 0x81) * $rowwidth;
  if ($second >= 0x80) {
    $jnum += $second - 0x80 + (0x7f - 0x40);
  }
  else {
    $jnum += $second - 0x40;
  }
  return $jnum;
}

# addeudc()
#   Assigns consecutive Unicode values, starting at U+E000, to the three
#   EUDC areas and stores them in %unicodetocp936 (overwriting any existing
#   entry for those Unicode values).  The areas are visited in this order:
#   AAA1-AFFE, F8A1-FEFE, then A140-A7A0 (without trail byte 7F).
sub addeudc
{
  my $u = 0xE000;

  # AAA1-AFFE and F8A1-FEFE: trail bytes A1..FE.
  for my $range ([0xAA, 0xAF], [0xF8, 0xFE]) {
    for my $lead ($range->[0] .. $range->[1]) {
      for my $trail (0xA1 .. 0xFE) {
        $unicodetocp936{sprintf("%04X", $u)} = sprintf("%02X%02X", $lead, $trail);
        $u++;
      }
    }
  }

  # A140-A7A0: trail bytes 40..7E and 80..A0 (7F is skipped).
  for my $lead (0xA1 .. 0xA7) {
    for my $trail (0x40 .. 0x7E, 0x80 .. 0xA0) {
      $unicodetocp936{sprintf("%04X", $u)} = sprintf("%02X%02X", $lead, $trail);
      $u++;
    }
  }
  return;
}

# readcp936()
#   Parses CP936.txt: tab-separated lines "0xGGGG<TAB>0xUUUU...", with lines
#   starting with "#" ignored and lines whose second field does not start
#   with "0x" skipped.  Fills %cp936tounicode and %unicodetocp936; for a
#   Unicode value seen more than once the first mapping is kept and a
#   warning is written to STDERR.
#   Dies if the file cannot be opened.
sub readcp936
{
  open(my $in, "<", "CP936.txt") || die "Cannot open CP936 file: $!";
  while (my $line = <$in>) {
    next if $line =~ /^#/;
    chomp $line;   # chomp, not chop: do not eat data on a last line without "\n"
    my ($gb, $u) = split(/\t/, $line);
    next unless defined $u && $u =~ /^0x/;

    my $u1  = substr($u, 2, 4);
    my $gb1 = substr($gb, 2, 4);
    $cp936tounicode{$gb1} = $u1;

    # String test on purpose: "==" would numify the hex strings, making
    # values such as "A1A1" look empty.
    if (!defined $unicodetocp936{$u1} || $unicodetocp936{$u1} eq "") {
      $unicodetocp936{$u1} = $gb1;
    }
    else {
      warn "WARNING: Unicode " . $u1 . " already map to CP936 " .
           $unicodetocp936{$u1} . " when we try to map to " . $gb1 . "\n";
    }
  }
  close($in);
  return;
}

# readgb18030()
#   Parses the file "GB18030": lines starting with an upper-case hex digit
#   are split on whitespace into a Unicode value and a GB18030 code.
#   Fills %gb18030tounicode and %unicodetogb18030; for a Unicode value seen
#   more than once the first mapping is kept and a warning is written to
#   STDERR.
#   Dies if the file cannot be opened.
sub readgb18030
{
  open(my $in, "<", "GB18030") || die "Cannot open GB18030 file: $!";
  while (my $line = <$in>) {
    next unless $line =~ /^[0-9A-F]/;
    chomp $line;
    my ($u, $gb) = split(/\s/, $line);
    # A line without a code field carries no mapping.
    next unless defined $gb && $gb ne "";

    $gb18030tounicode{$gb} = $u;
    if (!defined $unicodetogb18030{$u} || $unicodetogb18030{$u} eq "") {
      $unicodetogb18030{$u} = $gb;
    }
    else {
      warn "WARNING: Unicode " . $u . " already map to GB18030 " .
           $unicodetogb18030{$u} . " when we try to map to " . $gb . "\n";
    }
  }
  close($in);
  return;
}

# splittable()
#   Walks U+0000..U+FFFF and sorts every mapped value into one of the
#   output tables:
#     - same code in GB18030 and CP936        -> %tounicodecommon
#     - GB18030 code longer than 4 hex digits -> %gb180304btounicode
#     - other GB18030-only code               -> %gb18030tounicodeuniq
#     - CP936-only code                       -> %cp936tounicodeuniq
#   A Unicode value whose two codes differ contributes to both a GB18030
#   table and the CP936 table.
sub splittable
{
  for my $i (0 .. 0xFFFF) {
    my $u  = sprintf "%04X", $i;
    my $gb = defined $unicodetogb18030{$u} ? $unicodetogb18030{$u} : "";
    my $cp = defined $unicodetocp936{$u}   ? $unicodetocp936{$u}   : "";

    if ($gb eq $cp) {
      $tounicodecommon{$gb} = $u if $gb ne "";
      next;
    }

    if ($gb ne "") {
      if (length($gb) > 4) {
        $gb180304btounicode{$gb} = $u;
      }
      else {
        $gb18030tounicodeuniq{$gb} = $u;
      }
    }
    if ($cp ne "") {
      $cp936tounicodeuniq{$cp} = $u;
    }
  }
  return;
}

# gb4bytestoidx($gb)
#   Converts an 8-hex-digit code (four bytes) into a linear index and
#   returns it formatted as at least four upper-case hex digits.
#   Bytes 1 and 3 are taken relative to 0x81 (126 values each), bytes 2 and
#   4 relative to 0x30 (10 values each).
sub gb4bytestoidx
{
  my ($gb) = @_;
  my $b1 = hex(substr($gb, 0, 2)) - 0x81;
  my $b2 = hex(substr($gb, 2, 2)) - 0x30;
  my $b3 = hex(substr($gb, 4, 2)) - 0x81;
  my $b4 = hex(substr($gb, 6, 2)) - 0x30;
  return sprintf "%04X", ((($b1 * 10) + $b2) * 126 + $b3) * 10 + $b4;
}

# _write_table($file, \%table)
#   Private helper: writes "0x<key>\t0x<value>\n" for every entry of the
#   table, sorted by key, to $file.  Dies if the file cannot be opened or
#   closed.
sub _write_table
{
  my ($file, $table) = @_;
  open(my $out, ">", $file) || die "cannot open $file: $!";
  foreach my $gb (sort(keys %$table)) {
    print $out "0x" . $gb . "\t0x" . $table->{$gb} . "\n";
  }
  close($out) || die "cannot close $file: $!";
  return;
}

# printcommontable(): writes %tounicodecommon to gbkcommon.txt.
sub printcommontable
{
  _write_table("gbkcommon.txt", \%tounicodecommon);
  return;
}

# printcp936table(): writes %cp936tounicodeuniq to cp936uniq.txt.
sub printcp936table
{
  _write_table("cp936uniq.txt", \%cp936tounicodeuniq);
  return;
}

# printgb180304btable()
#   Writes %gb180304btounicode to gb180304b.txt as
#   "0x<index>\t0x<unicode>\t# 0x<code>\n", sorted by code, where <index>
#   comes from gb4bytestoidx().  Entries mapping to FFFF are left out.
#   Dies if the file cannot be opened or closed.
sub printgb180304btable
{
  my $file = "gb180304b.txt";
  open(my $out, ">", $file) || die "cannot open $file: $!";
  foreach my $gb (sort(keys %gb180304btounicode)) {
    next if $gb180304btounicode{$gb} eq "FFFF";
    print $out "0x" . gb4bytestoidx($gb) . "\t0x" . $gb180304btounicode{$gb} .
               "\t# 0x" . $gb . "\n";
  }
  close($out) || die "cannot close $file: $!";
  return;
}

# printgb18030table(): writes %gb18030tounicodeuniq to gb18030uniq.txt.
sub printgb18030table
{
  _write_table("gb18030uniq.txt", \%gb18030tounicodeuniq);
  return;
}

# genufut()
#   Echoes and then runs, through the shell, the commands that turn the
#   generated text tables into .uf/.ut files and cp936map.h.  A failing
#   command is reported on STDERR but does not stop the remaining ones.
sub genufut
{
  my @commands = (
    "umaptable -uf < gb18030uniq.txt > gb18030uniq2b.uf",
    "umaptable -ut < gb18030uniq.txt > gb18030uniq2b.ut",
    "umaptable -uf < cp936uniq.txt > gbkuniq2b.uf",
    "umaptable -ut < cp936uniq.txt > gbkuniq2b.ut",
    "umaptable -uf < gb180304b.txt > gb180304bytes.uf",
    "umaptable -ut < gb180304b.txt > gb180304bytes.ut",
    "perl cp936tocdx.pl > cp936map.h",
  );
  foreach my $cmd (@commands) {
    print($cmd . "\n");
    if (system($cmd) != 0) {
      warn "WARNING: command failed (status $?): $cmd\n";
    }
  }
  return;
}

readgb18030();
readcp936();
addeudc();
splittable();
printcommontable();
printgb180304btable();
printgb18030table();
printcp936table();
genufut();