|
1 #!/usr/local/bin/perl |
|
2 # -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- |
|
3 # |
|
4 # This Source Code Form is subject to the terms of the Mozilla Public |
|
5 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
6 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
# Mapping tables shared by the routines in this script.  Codes are stored
# as hex strings without the "0x" prefix.
#
# Every table is initialised with an empty LIST.  Assigning {} instead would
# store an anonymous-hash reference as a single stringified key
# ("HASH(0x...)"), and the print*table routines would then write that key
# out as a bogus mapping line.
our %gb18030tounicode     = ();   # GB18030 code -> Unicode (readgb18030)
our %unicodetogb18030     = ();   # Unicode -> GB18030 code (readgb18030)
our %unicodetocp936       = ();   # Unicode -> CP936 code (readcp936, addeudc)
our %cp936tounicode       = ();   # CP936 code -> Unicode (readcp936)
our %tounicodecommon      = ();   # codes on which both charsets agree (splittable)
our %gb18030tounicodeuniq = ();   # GB18030-only codes of up to 4 hex digits (splittable)
our %gb180304btounicode   = ();   # GB18030-only codes longer than 4 hex digits (splittable)
our %cp936tounicodeuniq   = ();   # CP936-only codes (splittable)

our %map = ();                    # not referenced by the routines visible in this file

# Width of one lead-byte row as used by cp936tonum():
# (0xff - 0x80) cells for trail bytes >= 0x80 plus (0x7f - 0x40) cells for
# trail bytes below 0x80, i.e. 190.
our $rowwidth = ((0xff - 0x80) + (0x7f - 0x40));
|
# Convert a CP936 double-byte code, given as a string of the form "0xHHLL",
# into a linear cell number.
#
#   $cp936  - string such as "0x8140"; characters 2-3 are the lead byte and
#             characters 4-5 the trail byte, both in hex.
#
# Returns ($lead - 0x81) * $rowwidth plus the trail-byte offset.  Trail bytes
# below 0x80 are counted from 0x40; trail bytes from 0x80 upwards continue
# after those (0x7f - 0x40) cells.  Relies on the file-level $rowwidth.
#
# No prototype is declared: the sub takes one argument, and an empty
# prototype would make an ordinary call cp936tonum($code) a compile error.
sub cp936tonum
{
    my ($cp936) = @_;

    my $lead  = hex(substr($cp936, 2, 2));
    my $trail = hex(substr($cp936, 4, 2));

    my $jnum = ($lead - 0x81) * $rowwidth;
    if ($trail >= 0x80) {
        $jnum += $trail - 0x80 + (0x7f - 0x40);
    }
    else {
        $jnum += $trail - 0x40;
    }
    return $jnum;
}
|
# Add the user-defined-character areas of CP936 to %unicodetocp936.
#
# Consecutive Unicode code points starting at U+E000 are assigned, in order,
# to every cell of three code areas:
#
#   AAA1-AFFE   lead AA..AF, trail A1..FE
#   F8A1-FEFE   lead F8..FE, trail A1..FE
#   A140-A7A0   lead A1..A7, trail 40..7E and 80..A0 (trail byte 7F is skipped)
#
# Existing %unicodetocp936 entries for those code points are overwritten.
# Takes no arguments and returns nothing.
sub addeudc
{
    # Each entry: first lead byte, last lead byte, then one or more
    # [first trail byte, last trail byte] ranges visited in order.
    my @areas = (
        [ 0xAA, 0xAF, [ 0xA1, 0xFE ] ],
        [ 0xF8, 0xFE, [ 0xA1, 0xFE ] ],
        [ 0xA1, 0xA7, [ 0x40, 0x7E ], [ 0x80, 0xA0 ] ],
    );

    # Lexical on purpose: the running code point must not leak into the
    # package variable $u that other routines in this file also use.
    my $u = 0xE000;

    for my $area (@areas) {
        my ($lead_first, $lead_last, @trail_ranges) = @$area;
        for my $lead ($lead_first .. $lead_last) {
            for my $range (@trail_ranges) {
                for my $trail ($range->[0] .. $range->[1]) {
                    my $us = sprintf("%04X", $u);
                    $unicodetocp936{$us} = sprintf("%02X%02X", $lead, $trail);
                    $u++;
                }
            }
        }
    }
    return;
}
|
83 |
|
# Load the CP936 mapping file "CP936.txt" from the current directory.
#
# Lines starting with '#' are ignored.  Every other line is split on tabs
# into a CP936 code and a Unicode value; lines whose second field does not
# start with "0x" are skipped.  The "0x" prefixes are stripped (up to four
# hex digits are kept) and the pair is recorded in:
#
#   %cp936tounicode  - always (the last line for a code wins)
#   %unicodetocp936  - only if the Unicode value has no mapping yet; a
#                      conflicting later line produces a warning on STDERR
#
# Dies if the file cannot be opened.  Takes no arguments, returns nothing.
sub readcp936
{
    open(my $in, '<', 'CP936.txt') or die "Cannot open CP936 file: $!";

    while (my $line = <$in>) {
        next if $line =~ /^#/;

        # chomp, not chop: a final line without a newline must keep its
        # last character.
        chomp $line;

        my ($gb, $u) = split(/\t/, $line);
        next unless defined($u) && $u =~ /^0x/;

        my $u1  = substr($u,  2, 4);
        my $gb1 = substr($gb, 2, 4);
        $cp936tounicode{$gb1} = $u1;

        # String test: a numeric == "" would treat every code beginning
        # with a hex letter (e.g. "A1A1") as 0, i.e. as "not mapped yet".
        if (!defined($unicodetocp936{$u1}) || $unicodetocp936{$u1} eq "") {
            $unicodetocp936{$u1} = $gb1;
        }
        else {
            warn "WARNING: Unicode " . $u1 . " already map to CP936 " .
                 $unicodetocp936{$u1} . " when we try to map to " . $gb1 . "\n";
        }
    }

    close($in);
    return;
}
|
# Load the GB18030 mapping file "GB18030" from the current directory.
#
# Only lines that start with an upper-case hex digit are used.  Each such
# line is split on single whitespace characters into a Unicode value and a
# GB18030 code (neither carries a "0x" prefix) and recorded in:
#
#   %gb18030tounicode  - always (the last line for a code wins)
#   %unicodetogb18030  - only if the Unicode value has no mapping yet; a
#                        conflicting later line produces a warning on STDERR
#
# Dies if the file cannot be opened.  Takes no arguments, returns nothing.
sub readgb18030
{
    open(my $in, '<', 'GB18030') or die "Cannot open GB18030 file: $!";

    while (my $line = <$in>) {
        next unless $line =~ /^[0-9A-F]/;

        # chomp, not chop: a final line without a newline must keep its
        # last character.
        chomp $line;

        my ($u, $gb) = split(/\s/, $line);
        $gb18030tounicode{$gb} = $u;

        # String test: a numeric == "" would treat every code beginning
        # with a hex letter (e.g. "A1A1") as 0, i.e. as "not mapped yet".
        if (!defined($unicodetogb18030{$u}) || $unicodetogb18030{$u} eq "") {
            $unicodetogb18030{$u} = $gb;
        }
        else {
            # Report the GB18030 values actually involved in the conflict.
            warn "WARNING: Unicode " . $u . " already map to GB18030 " .
                 $unicodetogb18030{$u} . " when we try to map to " . $gb . "\n";
        }
    }

    close($in);
    return;
}
|
# Partition the Unicode -> code tables into "common" and "unique" tables.
#
# For every code point 0000..FFFF (as a four-digit upper-case hex string):
#
#   * if %unicodetogb18030 and %unicodetocp936 hold the same non-empty code,
#     it goes to %tounicodecommon;
#   * otherwise a non-empty GB18030 code goes to %gb180304btounicode when it
#     is longer than four hex digits, else to %gb18030tounicodeuniq;
#   * and a non-empty CP936 code goes to %cp936tounicodeuniq.
#
# All target tables are keyed by code and hold the Unicode string.
# Takes no arguments and returns nothing.
sub splittable
{
    for my $i (0 .. 0xFFFF) {
        my $u = sprintf("%04X", $i);

        # Missing entries are treated exactly like empty ones.
        my $gb = $unicodetogb18030{$u};
        my $cp = $unicodetocp936{$u};
        $gb = "" unless defined $gb;
        $cp = "" unless defined $cp;

        if ($gb eq $cp) {
            $tounicodecommon{$gb} = $u if $gb ne "";
            next;
        }

        if ($gb ne "") {
            # length() is a function in Perl; "$code.length > 4" would
            # concatenate and compare numerically, filing two-byte codes
            # that start with a digit under the four-byte table.
            if (length($gb) > 4) {
                $gb180304btounicode{$gb} = $u;
            }
            else {
                $gb18030tounicodeuniq{$gb} = $u;
            }
        }

        if ($cp ne "") {
            $cp936tounicodeuniq{$cp} = $u;
        }
    }
    return;
}
|
# Convert a four-byte GB18030 code into its linear index.
#
#   $gb  - eight hex digits, e.g. "81308130" (no "0x" prefix)
#
# Bytes 1 and 3 are counted from 0x81 (126 values are assumed per byte),
# bytes 2 and 4 from 0x30 (10 values each), giving
#
#   ((b1 * 10 + b2) * 126 + b3) * 10 + b4
#
# Returns the index as an upper-case hex string of at least four digits.
#
# No prototype is declared: the sub takes one argument, and an empty
# prototype would make an ordinary call gb4bytestoidx($code) a compile error.
sub gb4bytestoidx
{
    my ($gb) = @_;

    my $b1 = hex(substr($gb, 0, 2)) - 0x81;
    my $b2 = hex(substr($gb, 2, 2)) - 0x30;
    my $b3 = hex(substr($gb, 4, 2)) - 0x81;
    my $b4 = hex(substr($gb, 6, 2)) - 0x30;

    return sprintf("%04X", ((($b1 * 10) + $b2) * 126 + $b3) * 10 + $b4);
}
|
# Write %tounicodecommon to "gbkcommon.txt" in the current directory.
#
# One line per entry, sorted by code as strings:
#   0x<code> TAB 0x<unicode> NEWLINE
#
# Dies if the file cannot be opened, or if closing it fails (buffered write
# errors such as a full disk only surface at close time).
# Takes no arguments and returns nothing.
sub printcommontable
{
    open(my $out, '>', 'gbkcommon.txt') or die "cannot open gbkcommon.txt: $!";
    foreach my $gb (sort keys %tounicodecommon) {
        print $out "0x" . $gb . "\t0x" . $tounicodecommon{$gb} . "\n";
    }
    close($out) or die "cannot close gbkcommon.txt: $!";
    return;
}
|
# Write %cp936tounicodeuniq to "cp936uniq.txt" in the current directory.
#
# One line per entry, sorted by code as strings:
#   0x<code> TAB 0x<unicode> NEWLINE
#
# Dies if the file cannot be opened, or if closing it fails (buffered write
# errors such as a full disk only surface at close time).
# Takes no arguments and returns nothing.
sub printcp936table
{
    open(my $out, '>', 'cp936uniq.txt') or die "cannot open cp936uniq.txt: $!";
    foreach my $gb (sort keys %cp936tounicodeuniq) {
        print $out "0x" . $gb . "\t0x" . $cp936tounicodeuniq{$gb} . "\n";
    }
    close($out) or die "cannot close cp936uniq.txt: $!";
    return;
}
|
# Write %gb180304btounicode to "gb180304b.txt" in the current directory.
#
# Entries are visited in string order of their code.  Entries whose Unicode
# value is "FFFF" are left out.  Each remaining entry becomes
#   0x<index> TAB 0x<unicode> TAB "# 0x"<code> NEWLINE
# where <index> is gb4bytestoidx(<code>).
#
# Dies if the file cannot be opened, or if closing it fails (buffered write
# errors such as a full disk only surface at close time).
# Takes no arguments and returns nothing.
sub printgb180304btable
{
    open(my $out, '>', 'gb180304b.txt') or die "cannot open gb180304b.txt: $!";
    foreach my $gb (sort keys %gb180304btounicode) {
        next if $gb180304btounicode{$gb} eq "FFFF";
        # Called with & so that any prototype on gb4bytestoidx is ignored.
        print $out "0x" . &gb4bytestoidx($gb) . "\t0x" . $gb180304btounicode{$gb} . "\t# 0x" . $gb . "\n";
    }
    close($out) or die "cannot close gb180304b.txt: $!";
    return;
}
|
# Write %gb18030tounicodeuniq to "gb18030uniq.txt" in the current directory.
#
# One line per entry, sorted by code as strings:
#   0x<code> TAB 0x<unicode> NEWLINE
#
# Dies if the file cannot be opened, or if closing it fails (buffered write
# errors such as a full disk only surface at close time).
# Takes no arguments and returns nothing.
sub printgb18030table
{
    open(my $out, '>', 'gb18030uniq.txt') or die "cannot open gb18030uniq.txt: $!";
    foreach my $gb (sort keys %gb18030tounicodeuniq) {
        print $out "0x" . $gb . "\t0x" . $gb18030tounicodeuniq{$gb} . "\n";
    }
    close($out) or die "cannot close gb18030uniq.txt: $!";
    return;
}
|
195 |
|
# Run the external generators over the text tables written by the
# print*table routines.
#
# Each command line is echoed to STDOUT and then handed to the shell via
# system().  The commands are fixed strings; they rely on shell redirection,
# so the single-string form of system() is intentional.
#
# A command that cannot be started or exits non-zero is reported on STDERR;
# the remaining commands are still run.
# Takes no arguments and returns nothing.
sub genufut
{
    my @commands = (
        "umaptable -uf < gb18030uniq.txt > gb18030uniq2b.uf",
        "umaptable -ut < gb18030uniq.txt > gb18030uniq2b.ut",
        "umaptable -uf < cp936uniq.txt > gbkuniq2b.uf",
        "umaptable -ut < cp936uniq.txt > gbkuniq2b.ut",
        "umaptable -uf < gb180304b.txt > gb180304bytes.uf",
        "umaptable -ut < gb180304b.txt > gb180304bytes.ut",
        "perl cp936tocdx.pl > cp936map.h",
    );

    for my $command (@commands) {
        print($command . "\n");
        if (system($command) != 0) {
            warn "WARNING: command failed (status " . $? . "): " . $command . "\n";
        }
    }
    return;
}
|
219 |
|
# Driver.  The order matters: both source tables are loaded first, the
# user-defined areas are then layered on top of the CP936 table, the result
# is split into common/unique tables, and only then are the text files
# written and post-processed.

# 1. Load the source mappings.
readgb18030();
readcp936();

# 2. Add the user-defined character areas, then partition the tables.
addeudc();
splittable();

# 3. Write the intermediate text tables.
printcommontable();
printgb180304btable();
printgb18030table();
printcp936table();

# 4. Run the external generators on those text tables.
genufut();