#!/usr/bin/perl
#
# gen-big5hkscs-2001-mozilla.pl
#   a Perl script that generates Big5-HKSCS <-> Unicode
#   conversion tables for Mozilla
#
# Author (of the original Perl script):
#   Anthony Fok <anthony@thizlinux.com> <foka@debian.org>
# Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd.
# License: GNU General Public License, v2 or later.
#
# This version includes original C source code from
# glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper <drepper@redhat.com>
# Roger So <roger.so@sw-linux.com>
#
# First attempt for Qt-2.3.x: 2001-09-21
# A working version for Qt-2.3.x: 2001-10-30
# Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21
# Adapted to generate conversion tables for Mozilla: 2002-11-26
# Adapted to generate conversion tables for Mozilla: 2002-11-30
# Cleaned up the script somewhat: 2002-12-04
# Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10
#
# Notes:
#
# 1. The latest version of this script may be found in:
#        http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
#        http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
#    Or, better yet, e-mail me and ask for the latest version.
#
# 2. This script generates data from 3 tables:
#      a. http://www.microsoft.com/typography/unicode/950.txt
#      b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt
#      c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt
#
#    Make sure your big5-iso.txt is the latest HKSCS-2001 version.
#    The three files are read from the current working directory.
#
# 3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into
#    different areas similar to the way Ulrich and Roger did it,
#    but extended for HKSCS-2001.
#
# 4. [Mozilla]: This script is very quick-and-dirty in some places.
#    Call either gen_mozilla_uf() or gen_mozilla_ut() to generate
#    the appropriate tables for feeding into "fromu" or "tou".
#
# 5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized.
#    Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode.
#    Otherwise, this script would generate a HKSCS table.
#    (Yes, I know, I should clean up this script and make it more modular,
#    and with command-line options or whatnot.  I'll do that later.  :-)
#
# If you have any questions or concerns, please feel free to contact me
# at Anthony Fok <anthony@thizlinux.com> or <foka@debian.org>  :-)
#
# Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK)
# for their generous support in this work.
#

# 1. UDA3, 0x8840 - 0x8dfe
# 2. UDA2, 0x8e40 - 0xa0fe
# 3. VDA,  0xc6a1 - 0xc8fe

use strict;
use warnings;

#use Getopt::Std;

# Configuration switches (edit here; there are no command-line options).
my $debug      = 0;    # print "redefined" diagnostics to STDERR
my $hkscs_mode = 1;    # 1 = Big5-HKSCS tables, 0 = TW-BIG5 tables
my $kangxi     = 0;    # also map B5+C6BF..C6D7 -> Kangxi Radicals (b2u)
my $use_range  = 0;    # CharMapML only: emit <range> for the UDA blocks
my $bmp_only   = 1;    # use the ISO 10646-1:2000 (BMP-only) column for b2u

# The two mapping tables shared by every subroutine below.  Keys and values
# are hexadecimal strings without a "0x" prefix.
my %b2u;               # Big5 code  -> Unicode code point
my %u2b;               # Unicode code point -> Big5 code

###########################################################################
#
# Main program
#

# First, read Microsoft's CP950 as base Big5.
read_cp950();

# Add mappings to Kangxi Radicals.
# The b2u direction is added only if $kangxi is true.
adjust_radicals();

# Then, read the HKSCS table.
# Again, see the $hkscs_mode variable.
read_hkscs_main();
read_hkscs_cmp() if $hkscs_mode;

post_tuning();

# Then, choose one of the following:
#gen_charmapml();
gen_mozilla_uf();
#gen_mozilla_ut();
#gen_check_u2b();
#gen_glibc();

# End of program
exit 0;

#############################################################################
#
# Subroutines
#
# NOTE: everything below "exit 0" is only compiled, never executed as
# top-level code, so data tables must live inside the subs that use them.
#

# Return the argument, or the empty string when it is undefined.  Used to
# build debug messages without "uninitialized value" warnings.
sub _str {
    my ($value) = @_;
    return defined $value ? $value : '';
}

# Read 950.txt (Microsoft CP950) into %b2u (from its DBCSTABLE sections) and
# %u2b (from its WCTABLE section), skipping the Unicode->Big5 entries listed
# below that the original author classified as "faked" fallbacks.
sub read_cp950 {
    my $file = '950.txt';
    open( my $cp950, '<', $file ) or die "Cannot open $file: $!\n";

    # Individual code points whose WCTABLE entry is dropped:
    # Latin-1 symbols, assorted punctuation/math/misc symbols, and three
    # fullwidth/halfwidth forms.  (0x0305 and 0x203E were marked "???" by
    # the original author.)
    my %dropped_fallback = map { $_ => 1 } (
        0x00A2, 0x00A3, 0x00A5, 0x00B4, 0x00B5, 0x00B8,
        0x0305, 0x2015, 0x2016, 0x2022, 0x2024, 0x2033,
        0x203E, 0x2216, 0x2218, 0x2263, 0x2307, 0x2609,
        0x2641, 0x301C, 0x3030,
        0xFF3E, 0xFF40, 0xFF64,
    );

    my $mode      = 0;     # 0 = outside a table, 1 = DBCSTABLE, 2 = WCTABLE
    my $expected  = 0;     # number of entries announced by the table header
    my $seen      = 0;     # entries consumed so far in the current table
    my $lead_byte = '';    # lead byte (hex) of the current DBCSTABLE

    while ( my $line = <$cp950> ) {
        $line =~ s/\r//;
        chomp $line;
        next if $line =~ /^$/;
        last if $line =~ /^ENDCODEPAGE/;

        if ( $line =~ /^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/ ) {
            ( $mode, $expected, $lead_byte, $seen ) = ( 1, $1, $2, 0 );
            next;
        }
        if ( $line =~ /^WCTABLE (\d+)/ ) {
            ( $mode, $expected, $seen ) = ( 2, $1, 0 );
            next;
        }
        next if $mode == 0;

        if ( $mode == 1 ) {

            # "<trail byte>\t<unicode>\t<comment>"
            my ( $low, $unicode ) = split /\t/, $line;
            $low     =~ s/^0x//;
            $unicode =~ s/^0x//;
            $b2u{ uc( $lead_byte . $low ) } = uc($unicode);
        }
        else {

            # "<unicode>\t<big5>\t<comment>"
            my ( $unicode, $big5 ) = split /\t/, $line;
            $unicode =~ s/^0x//;
            $big5    =~ s/^0x//;
            my $u_val = hex($unicode);
            my $b_val = hex($big5);

            # Skip Microsoft's over-generous (or over-zealous?) mappings.
            my $is_faked =

              # "Faked" accented latin characters
              ( $b_val <= 0xFF and $b_val != $u_val )

              # "Faked" Ideographic Annotation ___ Mark
              || ( $u_val >= 0x3192 and $u_val <= 0x319F )

              # "Faked" Parenthesized Ideograph ___
              || ( $u_val >= 0x3220 and $u_val <= 0x3243 )

              # "Faked" Circled Ideograph ___ except Circled Ideograph Correct
              || ( $u_val >= 0x3280 and $u_val <= 0x32B0 and $u_val != 0x32A3 )

              # Individually listed symbols
              || $dropped_fallback{$u_val};

            $u2b{ uc($unicode) } = uc($big5) unless $is_faked;
        }

        # Leave the table once the announced number of entries was read.
        if ( ++$seen == $expected ) {
            $mode     = 0;
            $expected = 0;
        }
    }
    close $cp950;
    return;
}

# Map B5+C6BF..B5+C6D7 to Kangxi Radicals.  The Unicode->Big5 direction is
# always added; the Big5->Unicode direction only when $kangxi is set.
sub adjust_radicals {

    # B5+C6BF - B5+C6D7: Radicals (?)

    # TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible.
    #
    # Big5-HKSCS tends towards using the character in Unicode CJK Ideographs
    # Note that HKSCS does not explicitly define
    #   B5+C6CF, B5+C6D3, B5+C6D5, B5+C6D7,
    # but do have these characters at B5+FBFD, B5+FCD3, B5+FEC1, B5+90C4,
    # mapped to U+5EF4, U+65E0, U+7676, U+96B6 respectively.
    #
    # As for B5+C6CD, HKSCS maps it to U+2F33 just like TW-BIG5.
    # However, it also maps B5+FBF4 to U+5E7A.

    # Radicals in Big5 order: entry N belongs to Big5 code 0xC6BF + N.
    my @radicals = qw(
      2F02 2F03 2F05 2F07 2F0C 2F0D 2F0E 2F13 2F16 2F19
      2F1B 2F22 2F27 2F2E 2F33 2F34 2F35 2F39 2F3A 2F41
      2F46 2F67 2F68 2FA1 2FAA
    );

    for my $offset ( 0 .. $#radicals ) {
        my $big5    = sprintf( "%04X", 0xC6BF + $offset );
        my $unicode = $radicals[$offset];
        $b2u{$big5}    = $unicode if $kangxi;
        $u2b{$unicode} = $big5;
    }
    return;
}

# Read big5-iso.txt (HKSCS).  Each data line carries a Big5 code followed by
# three Unicode columns, named here after the variables the original used:
# 1993, 2000 and 2001.  %b2u takes the 2000 column when $bmp_only is set and
# the 2001 column otherwise; %u2b gets an entry for all three columns.
sub read_hkscs_main {
    my $file = 'big5-iso.txt';
    open( my $iso_fh, '<', $file ) or die "Cannot open $file: $!\n";

    while ( my $line = <$iso_fh> ) {
        next
          unless $line =~
/([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/;

        # Upper-case so the keys agree with those written by read_cp950()
        # and read_hkscs_cmp().
        my ( $big5, $iso1993, $iso2000, $iso2001 ) =
          map { uc($_) } ( $1, $2, $3, $4 );

        my $code = hex($big5);

        # For non-HKSCS mode, only take data in the VDA range (?)
        if ( !$hkscs_mode ) {

            # Note that we don't go from B5+C6A1-B5+C6FE, but rather only
            # C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals)
            # because C8D4-C8FE are not assigned in TW-BIG5
            # if we are to follow Arphic PL Big-5 fonts.  (To be discussed)
            my $in_vda = ( $code >= 0xC6A1 && $code <= 0xC8D3 )
              && !( $code >= 0xC6BF && $code <= 0xC6D7 );
            my $in_f9 = ( $code >= 0xF9D6 && $code <= 0xF9FE );
            next unless $in_vda or $in_f9;
        }

        print STDERR
          "B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n"
          if $debug
          and defined( $b2u{$big5} )
          and $b2u{$big5} ne $iso2000;

        # B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to
        # U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square)
        # respectively.  Which is more correct?  I don't know!
        # (To be discussed)  In TW-BIG5 mode the CP950 value is kept.
        if ( $hkscs_mode or $code != 0xF9FE ) {
            $b2u{$big5} = $bmp_only ? $iso2000 : $iso2001;
        }

        # Unicode -> Big5 for every column, in the original order so that
        # a later column wins when two columns share a code point.
        for my $column (
            [ '1993', $iso1993 ],
            [ '2000', $iso2000 ],
            [ '2001', $iso2001 ]
          )
        {
            my ( $label, $unicode ) = @$column;

            print STDERR
              "$label: U+$unicode redefined from $u2b{$unicode} to $big5.\n"
              if $debug
              and defined( $u2b{$unicode} )
              and $u2b{$unicode} ne $big5;

            $u2b{$unicode} = $big5;
        }
    }
    close $iso_fh;
    return;
}    # read_hkscs_main()

# Read big5cmp.txt and apply the Big5 compatibility coding: each data line
# after the "=====" separator holds "<compat code> <standard code>".  The
# compat code is re-pointed at the standard code's Unicode value, and the
# compat code's old Unicode value is re-pointed at the standard code.
sub read_hkscs_cmp {

    ###########################################################################
    # Add Big5 compatibility coding...
    #
    # Stephan, here is the code segment that you may want to implement
    # in your convertbig5hkscs2001.pl
    #
    my $file = 'big5cmp.txt';
    open( my $cmp_fh, '<', $file ) or die "Cannot open $file: $!\n";

    my $in_table = 0;
    while ( my $line = <$cmp_fh> ) {
        if ( $line =~ /^=====/ ) { $in_table = 1; next; }
        next unless $in_table;

        # The table ends at the first line starting with whitespace
        # (this includes an empty line).
        last if $line =~ /^\s+/;
        chomp $line;

        my ( $big5cmp, $big5 ) = split ' ', $line;
        if ( !defined $big5 ) {
            warn "$file:$.: malformed line skipped: $line\n";
            next;
        }
        $big5cmp = uc($big5cmp);
        $big5    = uc($big5);

        my $uni    = $b2u{$big5};
        my $unicmp = $b2u{$big5cmp};

        my $report = sub {
            my ($label) = @_;
            my $u_key = _str($unicmp);
            printf STDERR "%s: U+%s -> %s, %s -> U+%s%s", $label, $u_key,
              _str( $u2b{$u_key} ), $big5cmp, _str( $b2u{$big5cmp} ),
              ( $label eq 'Was' ? "\t" : "\n" );
        };

        $report->('Was') if $debug;

        # Never store an undefined value or use an undefined key: that
        # would put junk rows into the generated tables.
        if ( defined $uni ) {
            $b2u{$big5cmp} = $uni;
        }
        else {
            warn "$file:$.: $big5 has no Unicode mapping; "
              . "$big5cmp left unchanged\n";
        }
        if ( defined $unicmp ) {
            $u2b{$unicmp} = $big5;
        }
        else {
            warn "$file:$.: $big5cmp had no previous Unicode mapping\n";
        }

        $report->('Now') if $debug;
    }
    close $cmp_fh;
    return;
}    # read_hkscs_cmp();

# Final manual adjustments applied after all source tables were read.
sub post_tuning {

    # And finally, fine-tuning...
    # Single-byte range 0x00..0x80 maps to itself (b2u direction only).
    for my $code ( 0x00 .. 0x80 ) {
        my $hex = sprintf( "%04X", $code );
        $b2u{$hex} = $hex;
    }

    # Add Euro at B5+A3E1 (I wonder why this 950.txt doesn't have it.)
    $b2u{"A3E1"} = "20AC";
    $u2b{"20AC"} = "A3E1";

    if ( !$hkscs_mode ) {

        # Box drawing characters:
        # Align with Big-5E (To be discussed, as it differs from CP950 and
        # HKSCS)
        $u2b{"2550"} = "A2A4";    # (also B5-F9F9)
        $u2b{"255E"} = "A2A5";    # (also B5-F9E9)
        $u2b{"2561"} = "A2A7";    # (also B5-F9EB)
        $u2b{"256A"} = "A2A6";    # (also B5-F9EA)
        $u2b{"256D"} = "A27E";    # (also B5-F9FA)
        $u2b{"256E"} = "A2A1";    # (also B5-F9FB)
        $u2b{"256F"} = "A2A3";    # (also B5-F9FD)
        $u2b{"2570"} = "A2A2";    # (also B5-F9FC)

        # "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30
        # (To be discussed)
        $b2u{"A2CC"} = "3038";
        $u2b{"3038"} = "A2CC";
        $b2u{"A2CD"} = "3039";
        $u2b{"3039"} = "A2CD";
        $b2u{"A2CE"} = "303A";
        $u2b{"303A"} = "A2CE";
    }

    # The character for ethnic group "Yi" (B5+C255):
    # (To be discussed)
    $u2b{"5F5E"} = "C255";    # Always add this.
    if ( !$hkscs_mode ) {
        $b2u{"C255"} = "5F5E";
    }
    return;
}    # post_tuning()

# Print a CharMapML (UTR #22) XML document for the current tables on STDOUT:
# round-trip <a> entries, Unicode->bytes fallbacks <fub>, bytes->Unicode
# fallbacks <fbu>, and optionally <range> entries for the TW-BIG5 UDA blocks.
sub gen_charmapml {

    ###########################################################################
    #
    # Codes for generating CharMapML XML file

    my $map_id = $hkscs_mode ? 'big5-hkscs-2001' : 'tw-big5-2002';

    print <<"EOT";
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE characterMapping SYSTEM "http://www.unicode.org/unicode/reports/tr22/CharacterMapping.dtd">
<characterMapping id="$map_id" version="1">
 <history>
 <modified version="1" date="2002-11-30">
 Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
 with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
 and with some other manual tweaking.
 </modified>
 </history>
 <validity>
 <state type="FIRST" next="VALID" s="0" e="80" max="FFFF"/>
 <state type="FIRST" next="SECOND" s="81" e="FE" max="FFFF"/>
 <state type="SECOND" next="VALID" s="40" e="7E" max="FFFF"/>
 <state type="SECOND" next="VALID" s="A1" e="FE" max="FFFF"/>
 </validity>
 <assignments sub="3F">
EOT

    my @unicodes_sorted = sort { hex($a) <=> hex($b) } keys %u2b;

    print " <!-- One to one mappings -->\n";
    for my $unicode (@unicodes_sorted) {
        my $big5 = $u2b{$unicode};
        my $u    = hex($unicode);

        # Only entries that round-trip exactly.
        next unless defined( $b2u{$big5} ) and $unicode eq $b2u{$big5};

        # The UDA blocks are emitted as <range> elements further down.
        next
          if $use_range
          and !$hkscs_mode
          and $u >= 0xE000
          and $u <= 0xF6B0;

        printf " <a u=\"%04X\" ", $u;
        if ( hex($big5) <= 0xFF ) {
            printf "b=\"%02X\"/>\n", hex($big5);
        }
        else {
            printf "b=\"%s %s\"/>\n", substr( $big5, 0, 2 ),
              substr( $big5, 2, 2 );
        }
    }

    print " <!-- Fallback mappings from Unicode to bytes -->\n";
    for my $unicode (@unicodes_sorted) {
        my $big5 = $u2b{$unicode};
        next if defined( $b2u{$big5} ) and hex($unicode) == hex( $b2u{$big5} );
        if ( $unicode eq "F900" ) {
            print " <!-- CJK Compatibility Ideographs: U+F900 - U+FA6A.\n";
            print
" These are included in CP950 (Unicode->Big5 direction only).\n";
            print " Should we include this area in TW-BIG5 or not? -->\n";
        }
        printf " <fub u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
          substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
    }

    # Collect Big5 codes whose Unicode value does not map back to them.
    # Visiting in ascending Big5 order means the highest Big5 code wins
    # when several share one Unicode value.
    my %fbu;
    print " <!-- Fallback mappings from bytes to Unicode -->\n";
    for my $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) {
        my $unicode = $b2u{$big5};
        if ( !defined( $u2b{$unicode} ) or hex($big5) != hex( $u2b{$unicode} ) )
        {
            $fbu{$unicode} = $big5;
        }
    }
    for my $unicode ( sort { hex($a) <=> hex($b) } keys %fbu ) {
        my $big5 = $fbu{$unicode};
        printf " <fbu u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
          substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
    }

    if ( $use_range and !$hkscs_mode ) {
        print <<'EOT';
 <!-- Roundtrip-mappings that can be enumerated
 Note: We can only use the <range> tag for TW-BIG5.
 Big-5E and Big5-HKSCS have assigned characters in these areas,
 and we will have to use the <a> and <fub> tags instead.
 -->
 <!-- User-Defined Area 1 (UDA1) -->
 <range uFirst="E000" uLast="E310" bFirst="FA 40" bLast="FE FE" bMin="81 40" bMax="FE FE"/>
 <!-- User-Defined Area 2 (UDA2) -->
 <range uFirst="E311" uLast="EEB7" bFirst="8E 40" bLast="A0 FE" bMin="81 40" bMax="FE FE"/>
 <!-- User-Defined Area 3 (UDA3) -->
 <range uFirst="EEB8" uLast="F6B0" bFirst="81 40" bLast="8D FE" bMin="81 40" bMax="FE FE"/>
EOT
    }

    print <<'EOT';
 </assignments>
</characterMapping>
EOT

    return;
}    # gen_charmapml()

# Dump %b2u as "U+<unicode>: <big5>" lines in ascending Big5 order, for
# verification and testing.  A leading "00" is stripped from the Big5 code.
sub gen_check_b2u {

    ###########################################################################
    #
    # Codes for generating a raw table for verification and testing
    #
    # #print $u2b{"F7D1"}, "\n";
    # print $b2u{$u2b{"F7D1"}}, "\n";
    # print "FA59 -> U+", $b2u{"FA59"}, "\n";

    for my $key ( sort { hex($a) <=> hex($b) } keys %b2u ) {
        my $unicode = $b2u{$key};
        ( my $big5 = $key ) =~ s/^00//;
        print "U+", $unicode, ": ", $big5, "\n";
    }
    return;
}

# Dump %u2b as "U+<unicode>: <big5>" lines in ascending Unicode order.
# A leading "00" is stripped from the Big5 code.
sub gen_check_u2b {
    for my $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
        ( my $big5 = $u2b{$unicode} ) =~ s/^00//;
        print "U+", $unicode, ": ", $big5, "\n";
    }
    return;
}

###########################################################################
#
# Codes for generating hkscs.ut and hkscs.uf files for Mozilla
#

# Print the hkscs.uf source ("0x<big5>\t0x<unicode>" per line) from %u2b.
# Entries are omitted when the Big5 code is below 0x8140 or inside
# 0xA140..0xC6A0 or 0xC940..0xF9D5, or when the code point is above U+FFFF.
sub gen_mozilla_uf {

    # hkscs.uf
    # Plain string sort, as in the original, to keep the output order.
    for my $unicode ( sort keys %u2b ) {
        my $big5 = $u2b{$unicode};
        my $code = hex($big5);

        next if $code >= 0xA140 and $code <= 0xC6A0;
        next if $code >= 0xC940 and $code <= 0xF9D5;
        next if $code < 0x8140;
        next if hex($unicode) > 0xFFFF;

        print "0x", uc($big5), "\t0x", uc($unicode), "\n";
    }
    return;
}

# Print the hkscs.ut source ("0x<big5>\t0x<unicode>" per line) from %b2u,
# with the same Big5 range exclusions as gen_mozilla_uf().
sub gen_mozilla_ut {

    # hkscs.ut
    for my $big5 ( sort keys %b2u ) {
        my $code = hex($big5);

        next if $code >= 0xA140 and $code <= 0xC6A0;
        next if $code < 0x8140;
        next if $code >= 0xC940 and $code <= 0xF9D5;

        print "0x", uc($big5), "\t0x", uc( $b2u{$big5} ), "\n";
    }
    return;
}

###########################################################################

# Print a complete glibc iconv module source (C) for BIG5HKSCS on STDOUT:
# the Big5->UCS table, the UCS4->Big5 table with its range index, and the
# fixed conversion-loop boilerplate.
sub gen_glibc {

    ##########################################################################
    #
    # Generate index for UCS4 to Big5-HKSCS conversion table
    #
    # Consecutive mapped code points are grouped into ranges; a range is
    # closed once 0x80 unmapped code points in a row have been seen.  Each
    # index entry is [ first, last, offset into the flat table ].
    #
    my @index_array;
    my $in_range = 0;
    my $count    = 0;    # running total of table slots; final table size
    my ( $range_start, $range_end ) = ( 0, 0 );

    for my $uni ( 0x81 .. 0x2FFFF ) {
        if ( defined( $u2b{ sprintf( "%04X", $uni ) } ) ) {
            $range_start = $uni unless $in_range;
            $range_end   = $uni;
            $in_range    = 1;
        }
        elsif ( $in_range and ( $uni - $range_end ) >= 0x80 ) {

            # Start a new range if the gap is 0x80 or larger
            push @index_array, [ $range_start, $range_end, $count ];
            $count += $range_end - $range_start + 1;
            $in_range = 0;
        }
    }

    # Flush a range that is still open when the scan ends, so that mappings
    # within 0x80 of the upper bound are not silently dropped.
    if ($in_range) {
        push @index_array, [ $range_start, $range_end, $count ];
        $count += $range_end - $range_start + 1;
    }

    #
    # Note that $count and $range_end are used again below
    #

    ###########################################################################
    #
    # Start generating real C code...
    #

    print <<'EOT';
/* Mapping tables for Big5-HKSCS handling.
 Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 Modified for Big5-HKSCS by Roger So <roger.so@sw-linux.com>, 2000.
 Updated for HKSCS-2001 by James Su <suzhe@turbolinux.com.cn>
 and Anthony Fok <anthony@thizlinux.com>, 2002

 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 The GNU C Library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with the GNU C Library; if not, write to the Free
 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 02111-1307 USA. */

#include <dlfcn.h>
#include <gconv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>


/* Table for Big5-HKSCS to UCS conversion.

 Original comments by Roger So when he updated the tables for HKSCS-1999:

 With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info:
 http://www.digital21.gov.hk/eng/hkscs/index.html
 - spacehunt 07/01/2000

 The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt
 and big5cmp.txt using a Perl script while merging C source code from
 other developers. A copy of the source Perl script is available at:

 http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
 http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl

 Revisions:
 2001-10-30 made codec for Qt
 2002-03-21 ported to glibc-2.2.5 and added HKSCS-2001

 Todo:
 Use a hash for characters beyond BMP to save space and make it
 more efficient

 - Anthony Fok <anthony@thizlinux.com> 21 Mar 2002
 On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China
*/

EOT

    ##########################################################################
    #
    # Generate Big5-HKSCS to Unicode conversion table
    #
    # One row of 157 slots (trail bytes 0x40..0x7E and 0xA1..0xFE) per lead
    # byte; unmapped slots are 0x0000.
    #
    # NOTE(review): the table is declared uint16_t, so this assumes %b2u
    # holds only 4-digit values, which is what $bmp_only = 1 selects.

    # for $high (0x81..0x8d, 0x8e..0xa0, 0xc6..0xc8, 0xf9, 0xfa..0xfe) {

    my $high_start = 0x88;
    my $high_end   = 0xfe;

    print "static const uint16_t big5_hkscs_to_ucs[",
      ( $high_end - $high_start + 1 ) * 157, "] =\n{\n";

    for my $high ( $high_start .. $high_end ) {
        for my $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) {
            if ( $low == 0x40 ) {
                print "\n" unless $high == $high_start;
                printf
                  "\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n",
                  $high, $high, $high, $high;
            }
            elsif ( $low == 0xa1 ) {
                print "\t\t";
            }

            my $big5 = sprintf( "%02X%02X", $high, $low );
            print "\t" if $low % 8 == 0;
            if ( defined( $b2u{$big5} ) ) {
                print "0x", $b2u{$big5}, ",";
            }
            else {
                print "0x0000,";    # for glibc
            }

            my $ends_line =
              ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe );
            print( $ends_line ? "\n" : "\t" );
        }
    }
    print "};\n\n";

    ##########################################################################
    #
    # Generate Unicode to Big5-HKSCS conversion table
    #
    print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n";
    for my $entry (@index_array) {
        my ( $start, $end ) = @{$entry}[ 0, 1 ];

        # Label and pad a range that does not begin on a multiple of 4 so
        # its first row lines up with the following ones.
        printf( " /* U+%04X */\t", $start ) if ( $start % 4 != 0 );
        print( ( "\t" x ( ( $start % 4 ) * 1.5 ) ) . ( " " x ( $start % 2 ) ) );

        for my $cp ( $start .. $end ) {
            printf( " /* U+%04X */\t", $cp ) if ( $cp % 4 == 0 );

            my $big5 = $u2b{ sprintf( "%04X", $cp ) };
            if ( !defined $big5 ) {
                print '"\x00\x00",';
            }
            elsif ( $big5 =~ /^00/ ) {

                # Single-byte code: the byte goes first, then a NUL.
                print '"\x', substr( $big5, 2, 2 ), '\x00",';
            }
            else {
                print '"\x', substr( $big5, 0, 2 ), '\x',
                  substr( $big5, 2, 2 ), '",';
            }

            print( ( $cp % 4 == 3 ) ? "\n" : " " ) unless $cp == $end;
        }

        # Blank line between ranges, but not after the very last one.
        print( ( $end == $range_end ) ? "\n" : "\n\n" );
    }
    print "};\n\n";

    ###########################################################################

    print <<'EOT';
static struct
{
 /* Note: We are going to split this table so that we can use
 uint16_t for "from" and "to" again. Anthony Fok, 2002-03-21 */
 uint32_t from;
 uint32_t to;
 uint32_t offset;
} from_ucs4_idx[] =
{
EOT
    for my $entry (@index_array) {
        my ( $from, $to, $offset ) = @$entry;
        printf " { %7s, %7s, %5d },\n", sprintf( "0x%04X", $from ),
          sprintf( "0x%04X", $to ), $offset;
    }
    print "};\n\n";

    #foreach $i (sort keys %b2u) {
    #    print $b2u{$i} . ' ';
    #}

    print <<'EOT';
/* Definitions used in the body of the `gconv' function. */
#define CHARSET_NAME "BIG5HKSCS//"
#define FROM_LOOP from_big5
#define TO_LOOP to_big5
#define DEFINE_INIT 1
#define DEFINE_FINI 1
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 2
#define MIN_NEEDED_TO 4


/* First define the conversion function from Big5-HKSCS to UCS4. */
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
 { \
 uint32_t ch = *inptr; \
 \
 if (ch >= 0x81 && ch <= 0xfe) \
 { \
 /* Two-byte character. First test whether the next character \
 is also available. */ \
 uint32_t ch2; \
 int idx; \
 \
 if (__builtin_expect (inptr + 1 >= inend, 0)) \
 { \
 /* The second character is not available. */ \
 result = __GCONV_INCOMPLETE_INPUT; \
 break; \
 } \
 \
 ch2 = inptr[1]; \
 /* See whether the second byte is in the correct range. */ \
 if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe)) \
 { \
 if (ch >= 0x88) \
 { \
 /* Look up the table */ \
 idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62); \
 if ((ch = big5_hkscs_to_ucs[idx]) == 0) \
 { \
 /* This is illegal. */ \
 if (! ignore_errors_p ()) \
 { \
 result = __GCONV_ILLEGAL_INPUT; \
 break; \
 } \
 \
 ++inptr; \
 ++*irreversible; \
 continue; \
 } \
 } \
 else \
 { \
 /* 0x81..0x87 in UDA3, currently maps linearly to PUA */ \
 ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62) \
 + 0xeeb8; \
 } \
 } \
 else \
 { \
 /* This is illegal. */ \
 if (! ignore_errors_p ()) \
 { \
 result = __GCONV_ILLEGAL_INPUT; \
 break; \
 } \
 \
 ++inptr; \
 ++*irreversible; \
 continue; \
 } \
 \
 inptr += 2; \
 } \
 else if (__builtin_expect (ch, 0) == 0xff) \
 { \
 result = __GCONV_ILLEGAL_INPUT; \
 break; \
 } \
 else /* 0x00 to 0x80 */ \
 ++inptr; \
 \
 put32 (outptr, ch); \
 outptr += 4; \
 }
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>


/* Next, define the other direction. */
#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
#define BODY \
 { \
 uint32_t ch = get32 (inptr); \
 const unsigned char *cp = ""; \
 unsigned char b5ch[2] = "\0\0"; \
 int i; \
 \
 for (i = 0; \
 i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0])); \
 ++i) \
 { \
 if (ch < from_ucs4_idx[i].from) \
 break; \
 if (from_ucs4_idx[i].to >= ch) \
 { \
 cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset \
 + ch - from_ucs4_idx[i].from]; \
 break; \
 } \
 } \
 \
 if (ch <= 0x80) \
 { \
 b5ch[0] = ch; \
 cp = b5ch; \
 } \
 \
 if (cp[0] == '\0' && ch != 0) \
 { \
 UNICODE_TAG_HANDLER (ch, 4); \
 \
 /* Illegal character. */ \
 STANDARD_ERR_HANDLER (4); \
 } \
 else \
 { \
 /* See whether there is enough room for the second byte we write. */ \
 if (__builtin_expect (cp[1], '\1') != '\0' \
 && __builtin_expect (outptr + 1 >= outend, 0)) \
 { \
 /* We have not enough room. */ \
 result = __GCONV_FULL_OUTPUT; \
 break; \
 } \
 \
 *outptr++ = cp[0]; \
 if (cp[1] != '\0') \
 *outptr++ = cp[1]; \
 } \
 \
 inptr += 4; \
 }
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>


/* Now define the toplevel functions. */
#include <iconv/skeleton.c>
EOT

    return;
}