#!/usr/local/bin/perl
# -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Reads a GB18030 mapping table and a CP936 mapping table, adds the CP936
# EUDC (user-defined) ranges, splits the mappings into "common to both",
# "GB18030 only (2 byte)", "GB18030 only (4 byte)" and "CP936 only" tables,
# writes each table to a text file and finally runs umaptable /
# cp936tocdx.pl on the generated files.

use strict;
use warnings;

# All tables are keyed and valued by upper-case hex strings without a
# "0x" prefix (e.g. "4E02", "8140", "81308130").
# They are initialised with an empty list: assigning {} would store a
# stringified hash reference as a bogus key, which then leaks into every
# generated table as a junk "0xHASH(0x...)" line.
our %gb18030tounicode     = ();  # GB18030 code -> Unicode
our %unicodetogb18030     = ();  # Unicode -> GB18030 code (first one seen)
our %unicodetocp936       = ();  # Unicode -> CP936 code
our %cp936tounicode       = ();  # CP936 code -> Unicode
our %tounicodecommon      = ();  # code shared by both charsets -> Unicode
our %gb18030tounicodeuniq = ();  # 2 byte code only in GB18030 -> Unicode
our %gb180304btounicode   = ();  # 4 byte GB18030 code -> Unicode
our %cp936tounicodeuniq   = ();  # code only in CP936 -> Unicode

our %map = ();

# Number of trail byte values counted per lead byte by cp936tonum():
# (0xff - 0x80) for the upper trail range plus (0x7f - 0x40) for the lower.
our $rowwidth = ((0xff - 0x80) + (0x7f - 0x40));

# cp936tonum($cp936)
#   $cp936 - a two byte code written as "0xHHLL".
# Returns the linear index of the code: rows of $rowwidth cells, first row
# for lead byte 0x81, lower trail range (0x40..) before upper (0x80..).
sub cp936tonum
{
  my ($cp936) = @_;
  my $first  = hex(substr($cp936, 2, 2));
  my $second = hex(substr($cp936, 4, 2));
  my $jnum   = ($first - 0x81) * $rowwidth;
  if ($second >= 0x80) {
    # Upper trail range comes after the (0x7f - 0x40) lower cells.
    $jnum += $second - 0x80 + (0x7f - 0x40);
  } else {
    $jnum += $second - 0x40;
  }
  return $jnum;
}

# addeudc()
# Maps consecutive Unicode values starting at U+E000 onto the three CP936
# EUDC areas, in this order: AAA1-AFFE, F8A1-FEFE, A140-A7A0 (trail byte
# 0x7F skipped).  Existing %unicodetocp936 entries for those Unicode
# values are overwritten unconditionally.
sub addeudc
{
  my $u = 0xE000;

  # For AAA1-AFFE
  for my $h (0xAA .. 0xAF) {
    for my $l (0xA1 .. 0xFE) {
      $unicodetocp936{sprintf("%04X", $u++)} = sprintf("%02X%02X", $h, $l);
    }
  }

  # For F8A1-FEFE
  for my $h (0xF8 .. 0xFE) {
    for my $l (0xA1 .. 0xFE) {
      $unicodetocp936{sprintf("%04X", $u++)} = sprintf("%02X%02X", $h, $l);
    }
  }

  # For A140-A7A0; we need to skip 7F
  for my $h (0xA1 .. 0xA7) {
    for my $l (0x40 .. 0x7E, 0x80 .. 0xA0) {
      $unicodetocp936{sprintf("%04X", $u++)} = sprintf("%02X%02X", $h, $l);
    }
  }
  return;
}

# readcp936([$path])
#   $path - optional input file name.
#           NOTE(review): the open() call was truncated in the reviewed
#           copy of this file; the default "CP936.TXT" is an assumption
#           based on the handle name -- confirm against the real source.
# Reads tab separated "0xGGGG<TAB>0xUUUU..." lines (lines starting with
# '#' are skipped) into %cp936tounicode and %unicodetocp936.  When a
# Unicode value is seen twice the first CP936 code is kept and a warning
# is written to STDERR.
sub readcp936
{
  my ($path) = @_;
  $path = "CP936.TXT" unless defined $path;

  open(my $in, "<", $path) or die "Cannot open CP936 file $path: $!";
  while (my $line = <$in>) {
    next if $line =~ /^#/;
    chomp $line;
    my ($gb, $u) = split(/\t/, $line);
    # Lines without a Unicode column (blank / undefined) are ignored.
    next unless defined $u && $u =~ /^0x/;

    my $u1  = substr($u, 2, 4);
    my $gb1 = substr($gb, 2, 4);
    $cp936tounicode{$gb1} = $u1;

    # String test: a numeric == "" treats every code starting with a
    # letter (e.g. "A1A1") as 0 and would silently overwrite it.
    if (!defined $unicodetocp936{$u1} || $unicodetocp936{$u1} eq "") {
      $unicodetocp936{$u1} = $gb1;
    } else {
      warn "WARNING: Unicode " . $u1 . " already map to CP936 " .
           $unicodetocp936{$u1} . " when we try to map to " . $gb1 . "\n";
    }
  }
  close($in);
  return;
}

# readgb18030([$path])
#   $path - optional input file name.
#           NOTE(review): the open() call was truncated in the reviewed
#           copy of this file; the default "GB18030" is an assumption
#           based on the handle name -- confirm against the real source.
# Reads "UUUU<whitespace>GGGG..." lines (only lines starting with an
# upper-case hex digit are used) into %gb18030tounicode and
# %unicodetogb18030.  When a Unicode value is seen twice the first
# GB18030 code is kept and a warning is written to STDERR.
sub readgb18030
{
  my ($path) = @_;
  $path = "GB18030" unless defined $path;

  open(my $in, "<", $path) or die "Cannot open GB18030 file $path: $!";
  while (my $line = <$in>) {
    next unless $line =~ /^[0-9A-F]/;
    chomp $line;
    my ($u, $gb) = split(/\s/, $line);
    next unless defined $gb;

    $gb18030tounicode{$gb} = $u;
    if (!defined $unicodetogb18030{$u} || $unicodetogb18030{$u} eq "") {
      $unicodetogb18030{$u} = $gb;
    } else {
      warn "WARNING: Unicode " . $u . " already map to GB18030 " .
           $unicodetogb18030{$u} . " when we try to map to " . $gb . "\n";
    }
  }
  close($in);
  return;
}

# splittable()
# Walks U+0000..U+FFFF and distributes the mappings collected in
# %unicodetogb18030 / %unicodetocp936:
#   same code in both charsets     -> %tounicodecommon
#   GB18030 code longer than 4 hex -> %gb180304btounicode (4 byte codes)
#   other GB18030 code             -> %gb18030tounicodeuniq
#   differing CP936 code           -> %cp936tounicodeuniq
sub splittable
{
  for (my $i = 0; $i < 0x10000; $i++) {
    my $u       = sprintf("%04X", $i);
    my $gb18030 = defined $unicodetogb18030{$u} ? $unicodetogb18030{$u} : "";
    my $cp936   = defined $unicodetocp936{$u}   ? $unicodetocp936{$u}   : "";

    if ($gb18030 eq $cp936) {
      $tounicodecommon{$gb18030} = $u if $gb18030 ne "";
      next;
    }

    if ($gb18030 ne "") {
      # length() of the code, not "$code . length": the latter compared
      # the code itself numerically and misfiled codes such as "8140".
      if (length($gb18030) > 4) {
        $gb180304btounicode{$gb18030} = $u;
      } else {
        $gb18030tounicodeuniq{$gb18030} = $u;
      }
    }
    if ($cp936 ne "") {
      $cp936tounicodeuniq{$cp936} = $u;
    }
  }
  return;
}

# gb4bytestoidx($gb)
#   $gb - four byte GB18030 code as 8 hex digits, e.g. "81308130".
# Returns the linear index of the code as a 4 digit upper-case hex
# string; bytes 1 and 3 are counted from 0x81 (126 values for byte 3),
# bytes 2 and 4 from 0x30 (10 values each).
sub gb4bytestoidx
{
  my ($gb) = @_;
  my $b1 = hex(substr($gb, 0, 2)) - 0x81;
  my $b2 = hex(substr($gb, 2, 2)) - 0x30;
  my $b3 = hex(substr($gb, 4, 2)) - 0x81;
  my $b4 = hex(substr($gb, 6, 2)) - 0x30;
  return sprintf("%04X", ((($b1 * 10) + $b2) * 126 + $b3) * 10 + $b4);
}

# _printtable($path, \%table)
# Writes "0x<code><TAB>0x<unicode>" lines, sorted by code, to $path.
sub _printtable
{
  my ($path, $table) = @_;
  open(my $out, ">", $path) or die "cannot open $path: $!";
  foreach my $gb (sort(keys %$table)) {
    print $out "0x" . $gb . "\t0x" . $table->{$gb} . "\n";
  }
  # Buffered write errors only show up at close time.
  close($out) or die "cannot close $path: $!";
  return;
}

# Writes %tounicodecommon to gbkcommon.txt.
sub printcommontable
{
  _printtable("gbkcommon.txt", \%tounicodecommon);
  return;
}

# Writes %cp936tounicodeuniq to cp936uniq.txt.
sub printcp936table
{
  _printtable("cp936uniq.txt", \%cp936tounicodeuniq);
  return;
}

# Writes %gb180304btounicode to gb180304b.txt as
# "0x<linear index><TAB>0x<unicode><TAB># 0x<4 byte code>".
# Entries mapped to FFFF are left out.
sub printgb180304btable
{
  my $path = "gb180304b.txt";
  open(my $out, ">", $path) or die "cannot open $path: $!";
  foreach my $gb (sort(keys %gb180304btounicode)) {
    next if $gb180304btounicode{$gb} eq "FFFF";
    print $out "0x" . gb4bytestoidx($gb) . "\t0x" .
               $gb180304btounicode{$gb} . "\t# 0x" . $gb . "\n";
  }
  close($out) or die "cannot close $path: $!";
  return;
}

# Writes %gb18030tounicodeuniq to gb18030uniq.txt.
sub printgb18030table
{
  _printtable("gb18030uniq.txt", \%gb18030tounicodeuniq);
  return;
}

# genufut()
# Echoes and then runs, through the shell, the commands that turn the
# generated text tables into .uf / .ut files and cp936map.h.  A failing
# command is reported on STDERR; the remaining commands are still run.
sub genufut
{
  my @commands = (
    "umaptable -uf < gb18030uniq.txt > gb18030uniq2b.uf",
    "umaptable -ut < gb18030uniq.txt > gb18030uniq2b.ut",
    "umaptable -uf < cp936uniq.txt > gbkuniq2b.uf",
    "umaptable -ut < cp936uniq.txt > gbkuniq2b.ut",
    "umaptable -uf < gb180304b.txt > gb180304bytes.uf",
    "umaptable -ut < gb180304b.txt > gb180304bytes.ut",
    "perl cp936tocdx.pl > cp936map.h",
  );
  foreach my $command (@commands) {
    print($command . "\n");
    system($command) == 0
      or warn "WARNING: command failed (status $?): $command\n";
  }
  return;
}

# Runs the whole generation in the original order.  addeudc() must come
# after readcp936() because it overwrites the EUDC entries.
sub main
{
  readgb18030();
  readcp936();
  addeudc();
  splittable();
  printcommontable();
  printgb180304btable();
  printgb18030table();
  printcp936table();
  genufut();
  return;
}

# Only run when executed as a script, so the subs can be loaded with
# require/do without touching any file.
main() unless caller;

1;