intl/uconv/tools/gengb18030tables.pl

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 #!/usr/local/bin/perl
michael@0 2 # -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
michael@0 3 #
michael@0 4 # This Source Code Form is subject to the terms of the Mozilla Public
michael@0 5 # License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 6 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Global lookup tables shared by the routines below.  All keys and values
# are upper-case hex strings without a "0x" prefix.
#
# NOTE: these are initialised with an empty list "()".  Assigning "{}" (a
# hash *reference*) to a hash creates a single junk key "HASH(0x...)" with
# an undefined value, which would later be written to the output tables.

%gb18030tounicode = ();      # GB18030 code -> Unicode (readgb18030)
%unicodetogb18030 = ();      # Unicode -> GB18030 code (readgb18030)
%unicodetocp936 = ();        # Unicode -> CP936 code   (readcp936, addeudc)
%cp936tounicode = ();        # CP936 code -> Unicode   (readcp936)
%tounicodecommon = ();       # code -> Unicode, identical in both charsets (splittable)
%gb18030tounicodeuniq = ();  # GB18030-only code of at most 4 hex digits -> Unicode (splittable)
%gb180304btounicode = ();    # GB18030-only code longer than 4 hex digits -> Unicode (splittable)
%cp936tounicodeuniq = ();    # CP936-only code -> Unicode (splittable)

%map = ();                   # not referenced by any routine visible in this file

# Number of trail-byte slots per lead byte used by cp936tonum():
# (0xff - 0x80) + (0x7f - 0x40) = 127 + 63 = 190.
$rowwidth = ((0xff - 0x80) + (0x7f - 0x40));
# Convert a CP936 double-byte code given as a "0xHHLL" string into a
# zero-based linear index.
#
# Parameters:
#   $cp936 - string whose characters 2-3 hold the lead byte in hex and
#            whose characters 4-5 hold the trail byte in hex ("0x8140").
# Returns:
#   (lead - 0x81) * $rowwidth plus the trail-byte offset within the row.
#   Trail bytes below 0x80 are counted from 0x40; trail bytes from 0x80
#   upward are placed after those first (0x7f - 0x40) slots.
#
# The sub is declared without a prototype: the former "()" prototype
# declared "no arguments" although the body reads @_, which made every
# call without a leading "&" a compile-time error.
sub cp936tonum
{
    my ($cp936) = @_;
    my $first  = hex(substr($cp936, 2, 2));
    my $second = hex(substr($cp936, 4, 2));

    # Each lead byte owns one row of $rowwidth slots (file-level global).
    my $jnum = ($first - 0x81) * $rowwidth;

    if ($second >= 0x80) {
        # Upper trail range follows the 0x40.. group inside the row.
        $jnum += $second - 0x80 + (0x7f - 0x40);
    }
    else {
        $jnum += $second - 0x40;
    }
    return $jnum;
}
# Add the end-user-defined-character mappings to %unicodetocp936.
#
# Unicode values are handed out sequentially starting at U+E000 while
# walking three CP936 areas in this order:
#   AAA1-AFFE  lead 0xAA..0xAF, trail 0xA1..0xFE
#   F8A1-FEFE  lead 0xF8..0xFE, trail 0xA1..0xFE
#   A140-A7A0  lead 0xA1..0xA7, trail 0x40..0x7E then 0x80..0xA0 (0x7F skipped)
# Existing entries for those Unicode values are overwritten.
#
# Takes no arguments and returns nothing.  The running code point is kept
# in a lexical so the package global $u (also used by the reader
# routines) is no longer clobbered as a side effect.
sub addeudc
{
    my $u = 0xE000;

    # [ first lead byte, last lead byte, list of trail bytes ]
    my @areas = (
        [ 0xAA, 0xAF, [ 0xA1 .. 0xFE ] ],
        [ 0xF8, 0xFE, [ 0xA1 .. 0xFE ] ],
        [ 0xA1, 0xA7, [ 0x40 .. 0x7E, 0x80 .. 0xA0 ] ],
    );

    foreach my $area (@areas) {
        my ($first_lead, $last_lead, $trails) = @$area;
        foreach my $lead ($first_lead .. $last_lead) {
            foreach my $trail (@$trails) {
                my $us = sprintf("%04X", $u);
                $unicodetocp936{$us} = sprintf("%02X%02X", $lead, $trail);
                $u++;
            }
        }
    }
    return;
}
michael@0 83
# Load the CP936 mapping table from "CP936.txt" in the current directory.
#
# Lines starting with "#" are ignored.  Every other line is split on TAB
# into a CP936 code and a Unicode value; lines whose second field does not
# start with "0x" are skipped.  The "0x" prefix is stripped from both
# fields (up to four characters are kept) and the pair is stored in
# %cp936tounicode.  %unicodetocp936 keeps the FIRST code seen for a given
# Unicode value; later duplicates produce a warning on STDERR.
#
# Takes no arguments, returns nothing, dies if the file cannot be opened.
sub readcp936
{
    open(my $in, '<', 'CP936.txt') or die "Cannot open CP936 file: $!";
    while (my $line = <$in>) {
        next if $line =~ /^#/;
        chomp $line;    # chomp, not chop: never eat a data character

        my ($gb, $u) = split(/\t/, $line);
        next unless defined $u && $u =~ /^0x/;

        my $u1  = substr($u, 2, 4);
        my $gb1 = substr($gb, 2, 4);
        $cp936tounicode{$gb1} = $u1;

        # String test on purpose: a numeric "==" treats every code that
        # starts with a letter (e.g. "A1A1") as 0, i.e. as "not mapped".
        if (!defined $unicodetocp936{$u1} || $unicodetocp936{$u1} eq "") {
            $unicodetocp936{$u1} = $gb1;
        }
        else {
            warn "WARNING: Unicode " . $u1 . " already map to CP936 " .
                $unicodetocp936{$u1} . " when we try to map to " . $gb1 . "\n";
        }
    }
    close($in);
    return;
}
# Load the GB18030 mapping table from the file "GB18030" in the current
# directory.
#
# Only lines starting with an upper-case hex digit are used.  Each such
# line is split on whitespace into a Unicode value followed by a GB18030
# code; the pair is stored in %gb18030tounicode.  %unicodetogb18030 keeps
# the FIRST code seen for a given Unicode value; later duplicates produce
# a warning on STDERR.  Lines without a second field are skipped.
#
# Takes no arguments, returns nothing, dies if the file cannot be opened.
sub readgb18030
{
    open(my $in, '<', 'GB18030') or die "Cannot open GB18030 file: $!";
    while (my $line = <$in>) {
        next unless $line =~ /^[0-9A-F]/;
        chomp $line;    # chomp, not chop: never eat a data character

        my ($u, $gb) = split(/\s+/, $line);
        next unless defined $gb && $gb ne "";

        $gb18030tounicode{$gb} = $u;

        # String test on purpose: a numeric "==" treats every code that
        # starts with a letter (e.g. "A1A1") as 0, i.e. as "not mapped".
        if (!defined $unicodetogb18030{$u} || $unicodetogb18030{$u} eq "") {
            $unicodetogb18030{$u} = $gb;
        }
        else {
            warn "WARNING: Unicode " . $u . " already map to GB18030 " .
                $unicodetogb18030{$u} . " when we try to map to " . $gb . "\n";
        }
    }
    close($in);
    return;
}
# Partition the Unicode -> code mappings for U+0000..U+FFFF into the
# output tables.
#
# For every code point, the GB18030 code (%unicodetogb18030) and the CP936
# code (%unicodetocp936) are compared as strings; a missing entry counts
# as the empty string.
#   - same non-empty code in both  -> %tounicodecommon
#   - GB18030 code differs         -> %gb180304btounicode when the code is
#                                     longer than 4 hex digits, otherwise
#                                     %gb18030tounicodeuniq
#   - CP936 code differs           -> %cp936tounicodeuniq
#
# Takes no arguments and returns nothing.
sub splittable
{
    foreach my $i (0 .. 0xFFFF) {
        my $u = sprintf("%04X", $i);

        my $gb = defined $unicodetogb18030{$u} ? $unicodetogb18030{$u} : "";
        my $cp = defined $unicodetocp936{$u}   ? $unicodetocp936{$u}   : "";

        if ($gb eq $cp) {
            $tounicodecommon{$gb} = $u if $gb ne "";
            next;
        }

        if ($gb ne "") {
            # length() of the hex string, not "$gb.length": the latter is
            # string concatenation followed by a numeric comparison and
            # misfiles two-byte codes that start with a digit.
            if (length($gb) > 4) {
                $gb180304btounicode{$gb} = $u;
            }
            else {
                $gb18030tounicodeuniq{$gb} = $u;
            }
        }
        if ($cp ne "") {
            $cp936tounicodeuniq{$cp} = $u;
        }
    }
    return;
}
# Convert a four-byte GB18030 code into its linear index.
#
# Parameters:
#   $gb - eight hex digits "B1B2B3B4" (no "0x" prefix).
# Returns:
#   The index as an upper-case hex string of at least four digits,
#   computed as (((B1-0x81)*10 + (B2-0x30))*126 + (B3-0x81))*10 + (B4-0x30).
#
# The sub is declared without a prototype: the former "()" prototype
# declared "no arguments" although the body reads @_, which made every
# call without a leading "&" a compile-time error.
sub gb4bytestoidx
{
    my ($gb) = @_;

    my $b1 = hex(substr($gb, 0, 2)) - 0x81;
    my $b2 = hex(substr($gb, 2, 2)) - 0x30;
    my $b3 = hex(substr($gb, 4, 2)) - 0x81;
    my $b4 = hex(substr($gb, 6, 2)) - 0x30;

    my $linear = ((($b1 * 10) + $b2) * 126 + $b3) * 10 + $b4;
    return sprintf("%04X", $linear);
}
# Write %tounicodecommon to "gbkcommon.txt" in the current directory.
#
# One line per entry, sorted by code as a string:
#   0x<code> TAB 0x<unicode> NEWLINE
#
# Takes no arguments, returns nothing.  Dies if the file cannot be opened
# or if closing it fails (buffered write errors only surface at close).
sub printcommontable
{
    open(my $out, '>', 'gbkcommon.txt')
        or die "cannot open gbkcommon.txt: $!";
    foreach my $gb (sort keys %tounicodecommon) {
        print $out "0x" . $gb . "\t0x" . $tounicodecommon{$gb} . "\n";
    }
    close($out) or die "cannot close gbkcommon.txt: $!";
    return;
}
# Write %cp936tounicodeuniq to "cp936uniq.txt" in the current directory.
#
# One line per entry, sorted by code as a string:
#   0x<code> TAB 0x<unicode> NEWLINE
#
# Takes no arguments, returns nothing.  Dies if the file cannot be opened
# or if closing it fails (buffered write errors only surface at close).
sub printcp936table
{
    open(my $out, '>', 'cp936uniq.txt')
        or die "cannot open cp936uniq.txt: $!";
    foreach my $gb (sort keys %cp936tounicodeuniq) {
        print $out "0x" . $gb . "\t0x" . $cp936tounicodeuniq{$gb} . "\n";
    }
    close($out) or die "cannot close cp936uniq.txt: $!";
    return;
}
# Write %gb180304btounicode to "gb180304b.txt" in the current directory.
#
# Entries whose Unicode value is "FFFF" are left out.  One line per
# remaining entry, sorted by code as a string:
#   0x<index> TAB 0x<unicode> TAB "# 0x"<code> NEWLINE
# where <index> is the result of gb4bytestoidx() for the code.
#
# Takes no arguments, returns nothing.  Dies if the file cannot be opened
# or if closing it fails (buffered write errors only surface at close).
sub printgb180304btable
{
    open(my $out, '>', 'gb180304b.txt')
        or die "cannot open gb180304b.txt: $!";
    foreach my $gb (sort keys %gb180304btounicode) {
        my $unicode = $gb180304btounicode{$gb};
        next if $unicode eq "FFFF";

        # Called with "&" so that any prototype on the helper is ignored.
        my $idx = &gb4bytestoidx($gb);
        print $out "0x" . $idx . "\t0x" . $unicode . "\t# 0x" . $gb . "\n";
    }
    close($out) or die "cannot close gb180304b.txt: $!";
    return;
}
# Write %gb18030tounicodeuniq to "gb18030uniq.txt" in the current
# directory.
#
# One line per entry, sorted by code as a string:
#   0x<code> TAB 0x<unicode> NEWLINE
#
# Takes no arguments, returns nothing.  Dies if the file cannot be opened
# or if closing it fails (buffered write errors only surface at close).
sub printgb18030table
{
    open(my $out, '>', 'gb18030uniq.txt')
        or die "cannot open gb18030uniq.txt: $!";
    foreach my $gb (sort keys %gb18030tounicodeuniq) {
        print $out "0x" . $gb . "\t0x" . $gb18030tounicodeuniq{$gb} . "\n";
    }
    close($out) or die "cannot close gb18030uniq.txt: $!";
    return;
}
michael@0 195
# Run the external post-processing commands on the generated text tables.
#
# Each command line is echoed to STDOUT and then handed to the shell via
# system().  A command that cannot be started or exits non-zero produces a
# warning on STDERR; the remaining commands are still run.
#
# Takes no arguments and returns nothing.
sub genufut
{
    my @commands = (
        "umaptable -uf < gb18030uniq.txt > gb18030uniq2b.uf",
        "umaptable -ut < gb18030uniq.txt > gb18030uniq2b.ut",
        "umaptable -uf < cp936uniq.txt > gbkuniq2b.uf",
        "umaptable -ut < cp936uniq.txt > gbkuniq2b.ut",
        "umaptable -uf < gb180304b.txt > gb180304bytes.uf",
        "umaptable -ut < gb180304b.txt > gb180304bytes.ut",
        "perl cp936tocdx.pl > cp936map.h",
    );

    foreach my $command (@commands) {
        print($command . "\n");
        # The shell redirection creates the output file even when the tool
        # fails, so report failures instead of silently leaving a stub.
        if (system($command) != 0) {
            warn "WARNING: command failed (status $?): $command\n";
        }
    }
    return;
}
michael@0 219
# Driver: load both source tables, add the user-defined area, split the
# mappings into common/unique sets, write the text tables and finally run
# the external converters on them.
readgb18030();            # fill %gb18030tounicode / %unicodetogb18030
readcp936();              # fill %cp936tounicode / %unicodetocp936
addeudc();                # add U+E000.. entries to %unicodetocp936
splittable();             # build the common / unique tables
printcommontable();       # gbkcommon.txt
printgb180304btable();    # gb180304b.txt
printgb18030table();      # gb18030uniq.txt
printcp936table();        # cp936uniq.txt
genufut();                # run umaptable and cp936tocdx.pl

mercurial