|
1 #!/usr/bin/perl -w |
|
2 # |
|
3 # gen-big5hkscs-2001-mozilla.pl |
|
4 # a Perl script that generates Big5-HKSCS <-> Unicode |
|
5 # conversion tables for Mozilla |
|
6 # |
|
7 # Author (of the original Perl script): |
|
8 # Anthony Fok <anthony@thizlinux.com> <foka@debian.org> |
|
9 # Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd. |
|
10 # License: GNU General Public License, v2 or later. |
|
11 # |
|
12 # This version includes original C source code from |
|
13 # glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper <drepper@redhat.com> |
|
14 # Roger So <roger.so@sw-linux.com> |
|
15 # |
|
16 # First attempt for Qt-2.3.x: 2001-09-21 |
|
17 # A working version for Qt-2.3.x: 2001-10-30 |
|
18 # Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21 |
|
19 # Adapted to generate conversion tables for Mozilla: 2002-11-26 |
|
20 # Adapted to generate conversion tables for Mozilla: 2002-11-30 |
|
21 # Cleaned up the script somewhat: 2002-12-04 |
|
22 # Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10 |
|
23 # |
|
24 # Notes: |
|
25 # |
|
26 # 1. The latest version of this script may be found in: |
|
27 # http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl |
|
28 # http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl |
|
29 # Or, better yet, e-mail me and ask for the latest version. |
|
30 # |
|
31 # 2. This script generates data from 3 tables: |
|
32 # a. http://www.microsoft.com/typography/unicode/950.txt |
|
33 # b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt |
|
34 # c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt |
|
35 # |
|
36 # Make sure your big5-iso.txt is the latest HKSCS-2001 version. |
|
37 # |
|
38 # 3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into |
|
39 # different areas similar to the way Ulrich and Roger did it, |
|
40 # but extended for HKSCS-2001. |
|
41 # |
|
42 # 4. [Mozilla]: This script is very quick-and-dirty in some places. |
|
43 # Call either gen_mozilla_uf() or gen_mozilla_ut() to generate |
|
44 # the appropriate tables for feeding into "fromu" or "tou". |
|
45 # |
|
46 # 5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized. |
|
47 # Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode. |
|
48 # Otherwise, this script would generate a HKSCS table. |
|
49 # (Yes, I know, I should clean up this script and make it more modular, |
|
50 # and with command-line options or whatnot. I'll do that later. :-) |
|
51 # |
|
52 # If you have any questions or concerns, please feel free to contact me |
|
53 # at Anthony Fok <anthony@thizlinux.com> or <foka@debian.org> :-) |
|
54 # |
|
55 # Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK) |
|
56 # for their generous support in this work. |
|
57 # |
|
58 |
|
59 # 1. UDA3, 0x8840 - 0x8dfe |
|
60 # 2. UDA2, 0x8e40 - 0xa0fe |
|
61 # 3. VDA, 0xc6a1 - 0xc8fe |
|
62 |
|
63 #use Getopt::Std; |
|
64 |
|
# Conversion tables shared by every subroutine in this file.  Keys and
# values are hexadecimal strings without a "0x" prefix:
#   %b2u : Big5 code      -> Unicode code point
#   %u2b : Unicode code point -> Big5 code
my ( %b2u, %u2b );

# Scratch variables that the readers and generators share.
my ( $unicode, $big5, $high, $low, $i, $count );

# Behaviour switches (edit here; there are no command-line options).
my $debug      = 0;    # true: report redefined mappings on STDERR
my $hkscs_mode = 1;    # true: Big5-HKSCS tables; false: TW-BIG5 tables
my $kangxi     = 0;    # true: also map the radicals Big5 -> Kangxi Radicals
my $use_range  = 0;    # true: gen_charmapml() emits <range> tags (TW-BIG5 only)
my $bmp_only   = 1;    # true: %b2u takes the ISO 10646-1:2000 column of
                       # big5-iso.txt, false: the 2001 column

#
# Forward declarations (the definitions follow the main program)
#
sub read_cp950();
sub adjust_radicals();
sub read_hkscs_main();
sub read_hkscs_cmp();
sub post_tuning();
sub gen_charmapml();
sub gen_check_b2u();
sub gen_check_u2b();
sub gen_mozilla_uf();
sub gen_mozilla_ut();
sub gen_glibc();

###########################################################################
#
# Main program
#

# Step 1: Microsoft's CP950 is the base Big5 table.
read_cp950();

# Step 2: Kangxi Radicals.  The Unicode -> Big5 direction is always added;
# the Big5 -> Unicode direction only when $kangxi is true.
adjust_radicals();

# Step 3: the HKSCS table; what is taken from it depends on $hkscs_mode.
read_hkscs_main();
if ($hkscs_mode) {
    read_hkscs_cmp();
}

# Step 4: manual corrections.
post_tuning();

# Step 5: pick exactly one generator.
#gen_charmapml();
gen_mozilla_uf();
#gen_mozilla_ut();
#gen_check_u2b();
#gen_glibc();

# End of program
exit 0;
|
118 |
|
119 |
|
120 ############################################################################# |
|
121 # |
|
122 # Subroutines |
|
123 # |
|
124 |
|
# Load Microsoft's CP950 table from "950.txt" in the current directory.
#
# Two kinds of sections are consumed, each announced by a header line that
# states how many entries follow:
#   DBCSTABLE n ;LeadByte = 0xhh   lines "0xTRAIL<TAB>0xUCS"  -> %b2u
#   WCTABLE n                      lines "0xUCS<TAB>0xBIG5"   -> %u2b
# Reading stops at the ENDCODEPAGE line.  All keys and values are stored
# as upper-case hex strings without the "0x" prefix.
#
# Takes no arguments; dies if the file cannot be opened.
sub read_cp950() {
    my $path = "950.txt";

    # Unicode code points for which CP950 only offers a look-alike Big5
    # character; their Unicode -> Big5 entries are dropped.
    my %look_alike = map { $_ => 1 } (

        # Latin-1: cent, pound, yen, acute accent, micro sign, cedilla
        0x00A2, 0x00A3, 0x00A5, 0x00B4, 0x00B5, 0x00B8,

        # Assorted marks and symbols (U+0305 and U+203E were flagged
        # "???" by the original author)
        0x0305, 0x2015, 0x2016, 0x2022, 0x2024, 0x2033, 0x203E, 0x2216,
        0x2218, 0x2263, 0x2307, 0x2609, 0x2641, 0x301C, 0x3030,

        # Fullwidth circumflex, fullwidth grave, halfwidth ideographic comma
        0xFF3E, 0xFF40, 0xFF64,
    );

    open( my $cp950, '<', $path ) or die "Cannot open $path: $!\n";

    my $section  = 0;     # 0 = outside a table, 1 = DBCSTABLE, 2 = WCTABLE
    my $lead     = '';    # lead byte (hex digits) of the current DBCSTABLE
    my $expected = 0;     # entry count announced by the section header
    my $seen     = 0;     # entries consumed so far in this section

    while ( my $line = <$cp950> ) {
        $line =~ s/\r//;
        chomp $line;
        next if $line =~ /^$/;
        last if $line =~ /^ENDCODEPAGE/;

        if ( $line =~ /^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/ ) {
            ( $section, $expected, $lead, $seen ) = ( 1, $1, $2, 0 );
            next;
        }
        if ( $line =~ /^WCTABLE (\d+)/ ) {
            ( $section, $expected, $seen ) = ( 2, $1, 0 );
            next;
        }
        next if $section == 0;

        # Only the first two tab-separated fields matter; a trailing
        # comment field, if any, is ignored.
        my ( $first, $second ) = split /\t/, $line;
        $first  =~ s/^0x//;
        $second =~ s/^0x//;

        if ( $section == 1 ) {

            # $first is the trail byte, $second the Unicode code point.
            $b2u{ uc( $lead . $first ) } = uc($second);
        }
        else {

            # $first is the Unicode code point, $second the Big5 code.
            my $ucs  = hex($first);
            my $code = hex($second);

            my $skip =

              # "Faked" accented Latin characters: a single byte that is
              # not the code point itself
              ( $code <= 0xFF && $code != $ucs )

              # "Faked" Ideographic Annotation marks
              || ( $ucs >= 0x3192 && $ucs <= 0x319F )

              # "Faked" Parenthesized Ideographs
              || ( $ucs >= 0x3220 && $ucs <= 0x3243 )

              # "Faked" Circled Ideographs, except U+32A3
              # (Circled Ideograph Correct)
              || ( $ucs >= 0x3280 && $ucs <= 0x32B0 && $ucs != 0x32A3 )

              # Individually listed look-alikes
              || $look_alike{$ucs};

            $u2b{ uc($first) } = uc($second) unless $skip;
        }

        # Leave the section once the announced number of entries is read.
        if ( ++$seen == $expected ) {
            $section  = 0;
            $expected = 0;
        }
    }

    close($cp950);
}
|
211 |
|
# Map the 25 radical characters at B5+C6BF - B5+C6D7 to the Kangxi
# Radicals block.
#
# The Unicode -> Big5 direction is always recorded in %u2b.  The
# Big5 -> Unicode direction is recorded in %b2u only when $kangxi is true.
#
# Author's background notes:
#  - TW-BIG5 as drafted by Autrijus uses Kangxi Radicals whenever possible,
#    whereas Big5-HKSCS tends towards the characters in the Unicode CJK
#    Ideographs block.
#  - HKSCS does not explicitly define B5+C6CF, B5+C6D3, B5+C6D5 and
#    B5+C6D7, but has these characters at B5+FBFD, B5+FCD3, B5+FEC1 and
#    B5+90C4, mapped to U+5EF4, U+65E0, U+7676 and U+96B6 respectively.
#  - HKSCS maps B5+C6CD to U+2F33 just like TW-BIG5, but also maps
#    B5+FBF4 to U+5E7A.
sub adjust_radicals() {

    # Kangxi Radical code points, in the order of the consecutive Big5
    # codes C6BF, C6C0, ..., C6D7.
    my @radicals = qw(
      2F02 2F03 2F05 2F07 2F0C
      2F0D 2F0E 2F13 2F16 2F19
      2F1B 2F22 2F27 2F2E 2F33
      2F34 2F35 2F39 2F3A 2F41
      2F46 2F67 2F68 2FA1 2FAA
    );

    my $code = 0xC6BF;
    for my $radical (@radicals) {
        my $key = sprintf( "%04X", $code++ );
        $u2b{$radical} = $key;
        $b2u{$key} = $radical if $kangxi;
    }
}
|
277 |
|
# Merge the HKSCS table "big5-iso.txt" (current directory) into %b2u and
# %u2b.
#
# Every line that contains four whitespace-separated hex fields
#   Big5   ISO-10646:1993   ISO-10646:2000   ISO-10646:2001
# is used; all other lines are ignored.  The fields are upper-cased so the
# keys agree with those produced by the other readers.
#
#  - In HKSCS mode every such line is taken.  Otherwise only Big5 codes in
#    C6A1-C8D3 (minus the radicals C6BF-C6D7) and F9D6-F9FE are taken.
#  - %b2u receives the 2000 column when $bmp_only is true, else the 2001
#    column.
#  - %u2b receives all three Unicode columns, later columns overriding
#    earlier ones.
#
# With $debug set, every redefinition is reported on STDERR.
# Takes no arguments; dies if the file cannot be opened.
sub read_hkscs_main() {
    my $path = "big5-iso.txt";
    open( my $table, '<', $path ) or die "Cannot open $path: $!\n";

    while ( my $line = <$table> ) {
        next
          unless $line =~
/([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/;
        my ( $big5, $iso1993, $iso2000, $iso2001 ) =
          map { uc($_) } ( $1, $2, $3, $4 );

        my $code = hex($big5);

        # For non-HKSCS mode, only take data in the VDA range (?)
        unless ($hkscs_mode) {

            # Not the whole of C6A1-C8FE: only C6A1-C8D3, excluding
            # C6BF-C6D7 (Kangxi Radicals), because C8D4-C8FE are not
            # assigned in TW-BIG5 if we are to follow the Arphic PL Big-5
            # fonts.  (To be discussed)
            my $in_vda = $code >= 0xC6A1
              && $code <= 0xC8D3
              && !( $code >= 0xC6BF && $code <= 0xC6D7 );
            my $in_f9 = $code >= 0xF9D6 && $code <= 0xF9FE;
            next unless $in_vda || $in_f9;
        }

        print STDERR
          "B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n"
          if $debug
          and defined( $b2u{$big5} )
          and $b2u{$big5} ne $iso2000;

        # B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to
        # U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square)
        # respectively, so the CP950 value is kept outside HKSCS mode.
        # (To be discussed)
        unless ( !$hkscs_mode and $code == 0xF9FE ) {
            $b2u{$big5} = $bmp_only ? $iso2000 : $iso2001;
        }

        # Unicode -> Big5 for each edition, oldest first so that a newer
        # edition's mapping wins when the same code point recurs.
        for my $column (
            [ "1993", $iso1993 ],
            [ "2000", $iso2000 ],
            [ "2001", $iso2001 ]
          )
        {
            my ( $edition, $ucs ) = @$column;

            print STDERR
              "$edition: U+$ucs redefined from $u2b{$ucs} to $big5.\n"
              if $debug
              and defined( $u2b{$ucs} )
              and $u2b{$ucs} ne $big5;

            $u2b{$ucs} = $big5;
        }
    }

    close($table);

}    # read_hkscs_main()
|
341 |
|
342 |
|
###########################################################################
# Add Big5 compatibility coding from "big5cmp.txt" (current directory).
#
# The data starts after the first line beginning with "=====" and ends at
# the first following line that begins with whitespace (a blank line
# included).  Each data line holds two whitespace-separated Big5 codes:
# a compatibility code followed by the code it duplicates.
#
# For each pair the compatibility code is re-pointed in %b2u at the code
# point of the duplicated code, and the code point the compatibility code
# used to have is re-pointed in %u2b at the duplicated code.
#
# With $debug set, the before/after state is reported on STDERR.
# Takes no arguments; dies if the file cannot be opened.
sub read_hkscs_cmp() {
    my $path = "big5cmp.txt";
    open( my $table, '<', $path ) or die "Cannot open $path: $!\n";

    my $in_data = 0;
    while ( my $line = <$table> ) {
        if ( $line =~ /^=====/ ) { $in_data = 1; next; }
        next unless $in_data;
        last if $line =~ /^\s+/;
        chomp $line;

        my ( $big5cmp, $big5 ) = split " ", $line;

        # A line with fewer than two codes cannot describe a pair.
        next unless defined $big5cmp and defined $big5;

        $big5cmp = uc($big5cmp);
        $big5    = uc($big5);
        my $uni    = $b2u{$big5};
        my $unicmp = $b2u{$big5cmp};

        print STDERR
          "Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t"
          if $debug;

        $b2u{$big5cmp} = $uni;

        # Without a previous code point there is nothing to redirect;
        # using undef as a key would create a bogus "" entry in %u2b.
        $u2b{$unicmp} = $big5 if defined $unicmp;

        print STDERR
          "Now: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n"
          if $debug;
    }

    close($table);
}    # read_hkscs_cmp();
|
376 |
|
377 |
|
# Final manual corrections applied on top of the tables read from disk.
#
# Always:
#   - Big5 0x00-0x80 map to the identical Unicode code points (%b2u only);
#   - the Euro sign is added as B5+A3E1 <-> U+20AC (950.txt lacks it);
#   - U+5F5E (the character for the ethnic group "Yi") maps to B5+C255.
# Outside HKSCS mode only (all marked "to be discussed" by the author):
#   - box-drawing characters are aligned with Big-5E, which differs from
#     CP950 and HKSCS;
#   - the "Hangzhou"/"Suzhou" numerals 10, 20, 30 are mapped both ways;
#   - B5+C255 also maps back to U+5F5E.
sub post_tuning() {

    # Single-byte identity mappings.
    foreach $i ( 0 .. 0x80 ) {
        $unicode = sprintf( "%04X", $i );
        $big5    = $unicode;
        $b2u{$big5} = $unicode;
    }

    # Euro sign.
    ( $b2u{"A3E1"}, $u2b{"20AC"} ) = ( "20AC", "A3E1" );

    # "Yi": the Unicode -> Big5 direction is always added.
    $u2b{"5F5E"} = "C255";

    # Everything below applies to TW-BIG5 only.
    return if $hkscs_mode;

    # Box drawing: Unicode => preferred Big5 code.  Each character also
    # exists at a second Big5 code in the F9xx row (noted on the right).
    my %box_drawing = qw(
      2550 A2A4
      255E A2A5
      2561 A2A7
      256A A2A6
      256D A27E
      256E A2A1
      256F A2A3
      2570 A2A2
    );    # also at F9F9, F9E9, F9EB, F9EA, F9FA, F9FB, F9FD, F9FC
    @u2b{ keys %box_drawing } = values %box_drawing;

    # Hangzhou/Suzhou numerals 10, 20, 30: Big5 => Unicode, both ways.
    my %numerals = qw( A2CC 3038 A2CD 3039 A2CE 303A );
    while ( my ( $code, $ucs ) = each %numerals ) {
        $b2u{$code} = $ucs;
        $u2b{$ucs}  = $code;
    }

    # "Yi": Big5 -> Unicode direction.
    $b2u{"C255"} = "5F5E";

}    # post_tuning()
|
423 |
|
424 |
|
###########################################################################
#
# Print a CharMapML (Unicode TR22) XML document for the current tables on
# the selected output handle.
#
# Sections, in order:
#   1. XML prologue, history and validity (byte-sequence state machine);
#      the mapping id is "big5-hkscs-2001" in HKSCS mode, else
#      "tw-big5-2002".
#   2. <a>   round-trip mappings (both %u2b and %b2u agree);
#   3. <fub> Unicode -> Big5 entries that do not round-trip;
#   4. <fbu> Big5 -> Unicode entries that do not round-trip;
#   5. <range> tags for the user-defined areas, only when $use_range is
#      set outside HKSCS mode (the same code points, U+E000 - U+F6B0, are
#      then left out of section 2).
#
sub gen_charmapml() {

    my $mapping_id = $hkscs_mode ? "big5-hkscs-2001" : "tw-big5-2002";
    my $ranged     = ( $use_range and !$hkscs_mode );

    # Comparator: order hex-string keys by their numeric value.
    my $by_value = sub { hex($a) <=> hex($b) };

    print <<EOT;
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE characterMapping SYSTEM "http://www.unicode.org/unicode/reports/tr22/CharacterMapping.dtd">
<characterMapping id="$mapping_id" version="1">
<history>
<modified version="1" date="2002-11-30">
Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
and with some other manual tweaking.
</modified>
</history>
<validity>
<state type="FIRST" next="VALID" s="0" e="80" max="FFFF"/>
<state type="FIRST" next="SECOND" s="81" e="FE" max="FFFF"/>
<state type="SECOND" next="VALID" s="40" e="7E" max="FFFF"/>
<state type="SECOND" next="VALID" s="A1" e="FE" max="FFFF"/>
</validity>
<assignments sub="3F">
EOT

    # --- Round-trip mappings -------------------------------------------
    print " <!-- One to one mappings -->\n";
    for $unicode ( sort $by_value keys %u2b ) {
        $big5 = $u2b{$unicode};
        $u    = hex($unicode);

        next if !defined( $b2u{$big5} );
        next if $unicode ne $b2u{$big5};
        next if $ranged && $u >= 0xE000 && $u <= 0xF6B0;

        # One byte is written as two hex digits, two bytes as a pair.
        my $bytes =
          hex($big5) <= 0xFF
          ? sprintf( "%02X", hex($big5) )
          : substr( $big5, 0, 2 ) . " " . substr( $big5, 2, 2 );
        printf " <a u=\"%04X\" b=\"%s\"/>\n", $u, $bytes;
    }

    # --- Unicode -> Big5 fallbacks -------------------------------------
    print " <!-- Fallback mappings from Unicode to bytes -->\n";
    for $unicode ( sort $by_value keys %u2b ) {
        $big5 = $u2b{$unicode};
        next if defined( $b2u{$big5} ) and hex($unicode) == hex( $b2u{$big5} );

        if ( $unicode eq "F900" ) {
            print " <!-- CJK Compatibility Ideographs: U+F900 - U+FA6A.\n",
              " These are included in CP950 (Unicode->Big5 direction only).\n",
              " Should we include this area in TW-BIG5 or not? -->\n";
        }
        printf " <fub u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
          substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
    }

    # --- Big5 -> Unicode fallbacks -------------------------------------
    # Collected per code point first so that the output is ordered by
    # Unicode value; a later Big5 code replaces an earlier one that falls
    # back to the same code point.
    print " <!-- Fallback mappings from bytes to Unicode -->\n";
    my %fbu;
    for $big5 ( sort $by_value keys %b2u ) {
        $unicode = $b2u{$big5};
        next
          if defined( $u2b{$unicode} )
          and hex($big5) == hex( $u2b{$unicode} );
        $fbu{$unicode} = $big5;
    }
    for $unicode ( sort $by_value keys %fbu ) {
        $big5 = $fbu{$unicode};
        printf " <fbu u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
          substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
    }

    # --- Enumerable user-defined areas ---------------------------------
    if ($ranged) {
        print <<EOT;
<!-- Roundtrip-mappings that can be enumerated
Note: We can only use the <range> tag for TW-BIG5.
Big-5E and Big5-HKSCS have assigned characters in these areas,
and we will have to use the <a> and <fub> tags instead.
-->
<!-- User-Defined Area 1 (UDA1) -->
<range uFirst="E000" uLast="E310" bFirst="FA 40" bLast="FE FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 2 (UDA2) -->
<range uFirst="E311" uLast="EEB7" bFirst="8E 40" bLast="A0 FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 3 (UDA3) -->
<range uFirst="EEB8" uLast="F6B0" bFirst="81 40" bLast="8D FE" bMin="81 40" bMax="FE FE"/>
EOT
    }

    print <<EOT;
</assignments>
</characterMapping>
EOT

}    # gen_charmapml()
|
540 |
|
###########################################################################
#
# Print a raw dump of %b2u for verification and testing: one line
# "U+<unicode>: <big5>" per entry, ordered by the numeric value of the
# Big5 code.  A leading "00" is stripped from the Big5 code on output.
#
sub gen_check_b2u() {
    my @codes = sort { hex($a) <=> hex($b) } keys %b2u;

    foreach $big5 (@codes) {
        $unicode = $b2u{$big5};
        ( my $shown = $big5 ) =~ s/^00//;
        print "U+", $unicode, ": ", $shown, "\n";
    }
}
|
557 |
|
# Print a raw dump of %u2b: one line "U+<unicode>: <big5>" per entry,
# ordered by the numeric value of the Unicode code point.  A leading "00"
# is stripped from the Big5 code on output.
sub gen_check_u2b() {
    my @code_points = sort { hex($a) <=> hex($b) } keys %u2b;

    foreach $unicode (@code_points) {
        ( $big5 = $u2b{$unicode} ) =~ s/^00//;
        print "U+", $unicode, ": ", $big5, "\n";
    }
}
|
566 |
|
567 ########################################################################### |
|
568 # |
|
569 # Codes for generating hkscs.ut and hkscs.uf files for Mozilla |
|
570 # |
|
# Print the Unicode -> Big5 table (hkscs.uf) for Mozilla: one line
# "0x<BIG5><TAB>0x<UNICODE>" per %u2b entry, in string order of the
# Unicode key.
#
# Entries are left out when the Big5 code is below 0x8140, lies in
# 0xA140-0xC6A0 or 0xC940-0xF9D5, or when the code point is above U+FFFF.
sub gen_mozilla_uf() {
    foreach $unicode ( sort keys %u2b ) {
        $big5 = $u2b{$unicode};
        my $code = hex($big5);

        next if $code < 0x8140;
        next if $code >= 0xA140 && $code <= 0xC6A0;
        next if $code >= 0xC940 && $code <= 0xF9D5;
        next if hex($unicode) > 0xFFFF;

        print "0x", uc($big5), "\t0x", uc($unicode), "\n";
    }
}
|
583 |
|
# Print the Big5 -> Unicode table (hkscs.ut) for Mozilla: one line
# "0x<BIG5><TAB>0x<UNICODE>" per %b2u entry, in string order of the Big5
# key.
#
# Entries are left out when the Big5 code is below 0x8140 or lies in
# 0xA140-0xC6A0 or 0xC940-0xF9D5.
sub gen_mozilla_ut() {
    foreach $big5 ( sort keys %b2u ) {
        my $code = hex($big5);

        next if $code < 0x8140;
        next if $code >= 0xA140 && $code <= 0xC6A0;
        next if $code >= 0xC940 && $code <= 0xF9D5;

        print "0x", uc($big5), "\t0x", uc( $b2u{$big5} ), "\n";
    }
}
|
594 |
|
595 |
|
596 ########################################################################### |
|
597 |
|
# Print the C source of glibc's Big5-HKSCS converter on the selected
# output handle.
#
# Output, in order:
#   1. licence and header comment, #include lines;
#   2. big5_hkscs_to_ucs[]    Big5 -> Unicode, 157 entries per lead byte
#                             for lead bytes 0x88-0xFE (0x0000 = unmapped);
#   3. ucs4_to_big5_hkscs[][] Unicode -> Big5, stored as a sequence of
#                             dense ranges;
#   4. from_ucs4_idx[]        first/last code point and table offset of
#                             each of those ranges;
#   5. the gconv loop bodies that use the tables.
#
# The ranges are found by scanning U+0081 - U+2FFFF through %u2b: a range
# ends once 0x80 or more consecutive code points are unmapped.
sub gen_glibc() {

    ##########################################################################
    #
    # Build the range index for the UCS4 to Big5-HKSCS conversion table.
    # Each element is [ first code point, last code point, table offset ].
    #
    my @index_array;
    my $in_range = 0;
    my $total    = 0;    # number of table slots allocated so far
    my ( $range_start, $range_end );

    for my $uni ( 0x81 .. 0x2FFFF ) {
        if ( defined( $u2b{ sprintf( "%04X", $uni ) } ) ) {
            $range_start = $uni unless $in_range;
            $range_end   = $uni;
            $in_range    = 1;
        }
        elsif ( $in_range and ( $uni - $range_end ) >= 0x80 ) {

            # Start a new range if the gap is 0x80 or larger
            push @index_array, [ $range_start, $range_end, $total ];
            $total += $range_end - $range_start + 1;
            $in_range = 0;
        }
    }

    # A range that is still open at the end of the scan (a mapping within
    # 0x80 of U+2FFFF) must be recorded as well, or it would be missing
    # from both the index and the table.
    if ($in_range) {
        push @index_array, [ $range_start, $range_end, $total ];
        $total += $range_end - $range_start + 1;
    }

    #
    # $total and $range_end (end of the last range) are used again below.
    #

    ###########################################################################
    #
    # Start generating real C code...
    #

    print <<'EOT';
/* Mapping tables for Big5-HKSCS handling.
Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
Modified for Big5-HKSCS by Roger So <roger.so@sw-linux.com>, 2000.
Updated for HKSCS-2001 by James Su <suzhe@turbolinux.com.cn>
and Anthony Fok <anthony@thizlinux.com>, 2002

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */

#include <dlfcn.h>
#include <gconv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>


/* Table for Big5-HKSCS to UCS conversion.

Original comments by Roger So when he updated the tables for HKSCS-1999:

With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info:
http://www.digital21.gov.hk/eng/hkscs/index.html
- spacehunt 07/01/2000

The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt
and big5cmp.txt using a Perl script while merging C source code from
other developers. A copy of the source Perl script is available at:

http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl

Revisions:
2001-10-30 made codec for Qt
2002-03-21 ported to glibc-2.2.5 and added HKSCS-2001

Todo:
Use a hash for characters beyond BMP to save space and make it
more efficient

- Anthony Fok <anthony@thizlinux.com> 21 Mar 2002
On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China
*/

EOT

    ##########################################################################
    #
    # Generate Big5-HKSCS to Unicode conversion table
    #
    # Lead bytes 0x88-0xFE only: the generated C code maps 0x81-0x87
    # linearly instead of looking them up.  Each lead byte has 157 trail
    # bytes (0x40-0x7E and 0xA1-0xFE).
    #
    my $high_start = 0x88;
    my $high_end   = 0xfe;

    print "static const uint16_t big5_hkscs_to_ucs[";
    print( ( $high_end - $high_start + 1 ) * 157 );
    print "] =\n{\n";
    for my $high ( $high_start .. $high_end ) {
        for my $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) {
            if ( $low == 0x40 ) {
                print "\n" unless $high == $high_start;
                printf
                  "\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n",
                  $high, $high, $high, $high;
            }
            elsif ( $low == 0xa1 ) {
                print "\t\t";
            }

            my $big5 = sprintf( "%02X%02X", $high, $low );
            print "\t" if $low % 8 == 0;
            if ( defined( $b2u{$big5} ) ) {
                print "0x", $b2u{$big5}, ",";
            }
            else {
                print "0x0000,";    # for glibc
            }

            # Eight entries per output line; the two halves of a row
            # always end their line.
            print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe )
                ? "\n"
                : "\t" );
        }
    }
    print "};\n\n";

    ##########################################################################
    #
    # Generate Unicode to Big5-HKSCS conversion table
    #
    print "static const unsigned char ucs4_to_big5_hkscs[$total][2] =\n{\n";
    foreach my $range (@index_array) {
        my ( $start, $end ) = @{$range}[ 0, 1 ];

        # Pad the first line of a range that does not start on a multiple
        # of four so that the columns line up.
        printf( " /* U+%04X */\t", $start ) if ( $start % 4 != 0 );
        print "\t" x ( ( $start % 4 ) * 1.5 ) . " " x ( $start % 2 );

        for my $ucs ( $start .. $end ) {
            printf( " /* U+%04X */\t", $ucs ) if ( $ucs % 4 == 0 );

            my $big5 = $u2b{ sprintf( "%04X", $ucs ) };
            if ( defined($big5) ) {
                if ( $big5 =~ /^00/ ) {

                    # Single byte: the byte itself followed by NUL.
                    print '"\x', substr( $big5, 2, 2 ), '\x00",';
                }
                else {
                    print '"\x', substr( $big5, 0, 2 ), '\x',
                      substr( $big5, 2, 2 ), '",';
                }
            }
            else {
                print '"\x00\x00",';
            }
            print( ( $ucs % 4 == 3 ) ? "\n" : " " ) unless $ucs == $end;
        }

        # A blank line separates ranges, except after the last one.
        print $end == $range_end ? "\n" : "\n\n";
    }
    print "};\n\n";

    ###########################################################################

    print <<EOT;
static struct
{
/* Note: We are going to split this table so that we can use
uint16_t for "from" and "to" again. Anthony Fok, 2002-03-21 */
uint32_t from;
uint32_t to;
uint32_t offset;
} from_ucs4_idx[] =
{
EOT
    foreach my $range (@index_array) {
        printf " { %7s, %7s, %5d },\n", sprintf( "0x%04X", $range->[0] ),
          sprintf( "0x%04X", $range->[1] ), $range->[2];
    }
    print "};\n\n";

    print <<'EOT';
/* Definitions used in the body of the `gconv' function. */
#define CHARSET_NAME "BIG5HKSCS//"
#define FROM_LOOP from_big5
#define TO_LOOP to_big5
#define DEFINE_INIT 1
#define DEFINE_FINI 1
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 2
#define MIN_NEEDED_TO 4


/* First define the conversion function from Big5-HKSCS to UCS4. */
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
uint32_t ch = *inptr; \
\
if (ch >= 0x81 && ch <= 0xfe) \
{ \
/* Two-byte character. First test whether the next character \
is also available. */ \
uint32_t ch2; \
int idx; \
\
if (__builtin_expect (inptr + 1 >= inend, 0)) \
{ \
/* The second character is not available. */ \
result = __GCONV_INCOMPLETE_INPUT; \
break; \
} \
\
ch2 = inptr[1]; \
/* See whether the second byte is in the correct range. */ \
if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe)) \
{ \
if (ch >= 0x88) \
{ \
/* Look up the table */ \
idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62); \
if ((ch = big5_hkscs_to_ucs[idx]) == 0) \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
} \
else \
{ \
/* 0x81..0x87 in UDA3, currently maps linearly to PUA */ \
ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62) \
+ 0xeeb8; \
} \
} \
else \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
\
inptr += 2; \
} \
else if (__builtin_expect (ch, 0) == 0xff) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
else /* 0x00 to 0x80 */ \
++inptr; \
\
put32 (outptr, ch); \
outptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>


/* Next, define the other direction. */
#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
#define BODY \
{ \
uint32_t ch = get32 (inptr); \
const unsigned char *cp = ""; \
unsigned char b5ch[2] = "\0\0"; \
int i; \
\
for (i = 0; \
i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0])); \
++i) \
{ \
if (ch < from_ucs4_idx[i].from) \
break; \
if (from_ucs4_idx[i].to >= ch) \
{ \
cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset \
+ ch - from_ucs4_idx[i].from]; \
break; \
} \
} \
\
if (ch <= 0x80) \
{ \
b5ch[0] = ch; \
cp = b5ch; \
} \
\
if (cp[0] == '\0' && ch != 0) \
{ \
UNICODE_TAG_HANDLER (ch, 4); \
\
/* Illegal character. */ \
STANDARD_ERR_HANDLER (4); \
} \
else \
{ \
/* See whether there is enough room for the second byte we write. */ \
if (__builtin_expect (cp[1], '\1') != '\0' \
&& __builtin_expect (outptr + 1 >= outend, 0)) \
{ \
/* We have not enough room. */ \
result = __GCONV_FULL_OUTPUT; \
break; \
} \
\
*outptr++ = cp[0]; \
if (cp[1] != '\0') \
*outptr++ = cp[1]; \
} \
\
inptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>


/* Now define the toplevel functions. */
#include <iconv/skeleton.c>
EOT

}