michael@0: #!/usr/bin/perl -w michael@0: # michael@0: # gen-big5hkscs-2001-mozilla.pl michael@0: # a Perl script that generates Big5-HKSCS <-> Unicode michael@0: # conversion tables for Mozilla michael@0: # michael@0: # Author (of the original Perl script): michael@0: # Anthony Fok michael@0: # Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd. michael@0: # License: GNU General Public License, v2 or later. michael@0: # michael@0: # This version includes original C source code from michael@0: # glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper michael@0: # Roger So michael@0: # michael@0: # First attempt for Qt-2.3.x: 2001-09-21 michael@0: # A working version for Qt-2.3.x: 2001-10-30 michael@0: # Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21 michael@0: # Adapted to generate conversion tables for Mozilla: 2002-11-26 michael@0: # Adapted to generate conversion tables for Mozilla: 2002-11-30 michael@0: # Cleaned up the script somewhat: 2002-12-04 michael@0: # Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10 michael@0: # michael@0: # Notes: michael@0: # michael@0: # 1. The latest version of this script may be found in: michael@0: # http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl michael@0: # http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl michael@0: # Or, better yet, e-mail me and ask for the latest version. michael@0: # michael@0: # 2. This script generates data from 3 tables: michael@0: # a. http://www.microsoft.com/typography/unicode/950.txt michael@0: # b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt michael@0: # c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt michael@0: # michael@0: # Make sure your big5-iso.txt is the latest HKSCS-2001 version. michael@0: # michael@0: # 3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into michael@0: # different areas similar to the way Ulrich and Roger did it, michael@0: # but extended for HKSCS-2001. michael@0: # michael@0: # 4. [Mozilla]: This script is very quick-and-dirty in some places. michael@0: # Call either gen_mozilla_uf() or gen_mozilla_ut() to generate michael@0: # the appropriate tables for feeding into "fromu" or "tou". michael@0: # michael@0: # 5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized. michael@0: # Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode. michael@0: # Otherwise, this script would generate a HKSCS table. michael@0: # (Yes, I know, I should clean up this script and make it more modular, michael@0: # and with command-line options or whatnot. I'll do that later. :-) michael@0: # michael@0: # If you have any questions or concerns, please feel free to contact me michael@0: # at Anthony Fok or :-) michael@0: # michael@0: # Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK) michael@0: # for their generous support in this work. michael@0: # michael@0: michael@0: # 1. UDA3, 0x8840 - 0x8dfe michael@0: # 2. UDA2, 0x8e40 - 0xa0fe michael@0: # 3. VDA, 0xc6a1 - 0xc8fe michael@0: michael@0: #use Getopt::Std; michael@0: michael@0: my ( %b2u, %u2b, $unicode, $big5, $high, $low, $i, $count ); michael@0: michael@0: my $debug = 0; michael@0: my $hkscs_mode = 1; michael@0: my $kangxi = 0; michael@0: my $use_range = 0; michael@0: my $bmp_only = 1; michael@0: michael@0: # michael@0: # Subroutine Declaration michael@0: # michael@0: sub read_cp950(); michael@0: sub adjust_radicals(); michael@0: sub read_hkscs_main(); michael@0: sub read_hkscs_cmp(); michael@0: sub post_tuning(); michael@0: sub gen_charmapml(); michael@0: sub gen_check_b2u(); michael@0: sub gen_check_u2b(); michael@0: sub gen_mozilla_uf(); michael@0: sub gen_mozilla_ut(); michael@0: sub gen_glibc(); michael@0: michael@0: ########################################################################### michael@0: # michael@0: # Main program michael@0: # michael@0: michael@0: # First, read Microsoft's CP950 as base Big5. michael@0: read_cp950 (); michael@0: michael@0: # Add mappings to Kangxi Radicals. michael@0: # The b2u direction is added only if $kangxi is not null. michael@0: adjust_radicals (); michael@0: michael@0: # Then, read the HKSCS table. michael@0: # Again, see the $hkscs_mode variable. michael@0: read_hkscs_main (); michael@0: read_hkscs_cmp () if $hkscs_mode; michael@0: michael@0: post_tuning (); michael@0: michael@0: michael@0: # Then, choose one of the following: michael@0: #gen_charmapml(); michael@0: gen_mozilla_uf(); michael@0: #gen_mozilla_ut(); michael@0: #gen_check_u2b(); michael@0: #gen_glibc(); michael@0: michael@0: michael@0: # End of program michael@0: exit 0; michael@0: michael@0: michael@0: ############################################################################# michael@0: # michael@0: # Subroutines michael@0: # michael@0: michael@0: sub read_cp950() { michael@0: open( CP950, "950.txt" ) or die; michael@0: my $mode = 0; michael@0: while () { michael@0: s/\r//; michael@0: chomp; michael@0: next if /^$/; michael@0: last if /^ENDCODEPAGE/; michael@0: michael@0: if (/^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/) { michael@0: $mode = 1; michael@0: ( $count, $high ) = ( $1, $2 ); michael@0: $i = 0; michael@0: next; michael@0: } michael@0: if (/^WCTABLE (\d+)/) { michael@0: $mode = 2; michael@0: $count = $1; michael@0: $i = 0; michael@0: next; michael@0: } michael@0: next if $mode == 0; michael@0: michael@0: if ( $mode == 1 ) { michael@0: ( $low, $unicode, $comment ) = split "\t"; michael@0: $low =~ s/^0x//; michael@0: $unicode =~ s/^0x//; michael@0: $big5 = $high . $low; michael@0: $b2u{ uc($big5) } = uc($unicode); michael@0: if ( ++$i == $count ) { $mode = 0; $count = 0; next; } michael@0: } michael@0: michael@0: if ( $mode == 2 ) { michael@0: ( $unicode, $big5, $comment ) = split "\t"; michael@0: $unicode =~ s/^0x//; michael@0: $big5 =~ s/^0x//; michael@0: my $u = hex($unicode); michael@0: my $b = hex($big5); michael@0: michael@0: $u2b{ uc($unicode) } = uc($big5) unless michael@0: michael@0: # Skip Microsoft's over-generous (or over-zealous?) mappings michael@0: # "Faked" accented latin characters michael@0: ( $b <= 0xFF and $b != $u ) michael@0: michael@0: # "Faked" Ideographic Annotation ___ Mark michael@0: or ( $u >= 0x3192 and $u <= 0x319F ) michael@0: michael@0: # "Faked" Parenthesized Ideograph ___ michael@0: or ( $u >= 0x3220 and $u <= 0x3243 ) michael@0: michael@0: # "Faked" Circled Ideograph ___ except Circled Ideograph Correct michael@0: or ( $u >= 0x3280 and $u <= 0x32B0 and $u != 0x32A3 ) michael@0: michael@0: # ¢F¢G¢D¡¦£g¡M michael@0: or ( $u == 0xA2 michael@0: or $u == 0xA3 michael@0: or $u == 0xA5 michael@0: or $u == 0xB4 michael@0: or $u == 0xB5 michael@0: or $u == 0xB8 ) michael@0: michael@0: # ¡Â¢w¡ü¡E£»¡²¡Ã¢B¢X¡Ý¡[¡ó¡ò¡ã¡Ê michael@0: or ( $u == 0x0305 # ??? michael@0: or $u == 0x2015 michael@0: or $u == 0x2016 michael@0: or $u == 0x2022 michael@0: or $u == 0x2024 michael@0: or $u == 0x2033 michael@0: or $u == 0x203E # ??? michael@0: or $u == 0x2216 michael@0: or $u == 0x2218 michael@0: or $u == 0x2263 michael@0: or $u == 0x2307 michael@0: or $u == 0x2609 michael@0: or $u == 0x2641 michael@0: or $u == 0x301C michael@0: or $u == 0x3030 ) michael@0: michael@0: # ¡s¡¥¡N michael@0: or ( $u == 0xFF3E or $u == 0xFF40 or $u == 0xFF64 ); michael@0: michael@0: if ( ++$i == $count ) { $mode = 0; $count = 0; next; } michael@0: } michael@0: } michael@0: } michael@0: michael@0: sub adjust_radicals() { michael@0: michael@0: # B5+C6BF - B5+C6D7: Radicals (?) michael@0: michael@0: # TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible. michael@0: # michael@0: # Big5-HKSCS tends towards using the character in Unicode CJK Ideographs michael@0: # Note that HKSCS does not explicitly define michael@0: # B5+C6CF, B5+C6D3, B5+C6D5, B5+C6D7 (ÆÏ¡BÆÓ¡BÆÕ¡BÆ×), michael@0: # but do have these characters at B5+FBFD, B5+FCD3, B5+FEC1, B5+90C4, michael@0: # mapped to U+5EF4, U+65E0, U+7676, U+96B6 respectively. michael@0: # michael@0: # As for B5+C6CD (ÆÍ), HKSCS maps it to U+2F33 just like TW-BIG5. michael@0: # However, it also maps B5+FBF4 (ûô) to U+5E7A. michael@0: $b2u{"C6BF"} = "2F02" if $kangxi; michael@0: $u2b{"2F02"} = "C6BF"; # Æ¿ michael@0: $b2u{"C6C0"} = "2F03" if $kangxi; michael@0: $u2b{"2F03"} = "C6C0"; # ÆÀ michael@0: $b2u{"C6C1"} = "2F05" if $kangxi; michael@0: $u2b{"2F05"} = "C6C1"; # ÆÁ michael@0: $b2u{"C6C2"} = "2F07" if $kangxi; michael@0: $u2b{"2F07"} = "C6C2"; # ÆÂ michael@0: $b2u{"C6C3"} = "2F0C" if $kangxi; michael@0: $u2b{"2F0C"} = "C6C3"; # ÆÃ michael@0: $b2u{"C6C4"} = "2F0D" if $kangxi; michael@0: $u2b{"2F0D"} = "C6C4"; # ÆÄ michael@0: $b2u{"C6C5"} = "2F0E" if $kangxi; michael@0: $u2b{"2F0E"} = "C6C5"; # ÆÅ michael@0: $b2u{"C6C6"} = "2F13" if $kangxi; michael@0: $u2b{"2F13"} = "C6C6"; # ÆÆ michael@0: $b2u{"C6C7"} = "2F16" if $kangxi; michael@0: $u2b{"2F16"} = "C6C7"; # ÆÇ michael@0: $b2u{"C6C8"} = "2F19" if $kangxi; michael@0: $u2b{"2F19"} = "C6C8"; # ÆÈ michael@0: $b2u{"C6C9"} = "2F1B" if $kangxi; michael@0: $u2b{"2F1B"} = "C6C9"; # ÆÉ michael@0: $b2u{"C6CA"} = "2F22" if $kangxi; michael@0: $u2b{"2F22"} = "C6CA"; # ÆÊ michael@0: $b2u{"C6CB"} = "2F27" if $kangxi; michael@0: $u2b{"2F27"} = "C6CB"; # ÆË michael@0: $b2u{"C6CC"} = "2F2E" if $kangxi; michael@0: $u2b{"2F2E"} = "C6CC"; # ÆÌ michael@0: $b2u{"C6CD"} = "2F33" if $kangxi; michael@0: $u2b{"2F33"} = "C6CD"; # ÆÍ michael@0: $b2u{"C6CE"} = "2F34" if $kangxi; michael@0: $u2b{"2F34"} = "C6CE"; # ÆÎ michael@0: $b2u{"C6CF"} = "2F35" if $kangxi; michael@0: $u2b{"2F35"} = "C6CF"; # ÆÏ michael@0: $b2u{"C6D0"} = "2F39" if $kangxi; michael@0: $u2b{"2F39"} = "C6D0"; # ÆÐ michael@0: $b2u{"C6D1"} = "2F3A" if $kangxi; michael@0: $u2b{"2F3A"} = "C6D1"; # ÆÑ michael@0: $b2u{"C6D2"} = "2F41" if $kangxi; michael@0: $u2b{"2F41"} = "C6D2"; # ÆÒ michael@0: $b2u{"C6D3"} = "2F46" if $kangxi; michael@0: $u2b{"2F46"} = "C6D3"; # ÆÓ michael@0: $b2u{"C6D4"} = "2F67" if $kangxi; michael@0: $u2b{"2F67"} = "C6D4"; # ÆÔ michael@0: $b2u{"C6D5"} = "2F68" if $kangxi; michael@0: $u2b{"2F68"} = "C6D5"; # ÆÕ michael@0: $b2u{"C6D6"} = "2FA1" if $kangxi; michael@0: $u2b{"2FA1"} = "C6D6"; # ÆÖ michael@0: $b2u{"C6D7"} = "2FAA" if $kangxi; michael@0: $u2b{"2FAA"} = "C6D7"; # Æ× michael@0: } michael@0: michael@0: sub read_hkscs_main() { michael@0: michael@0: open( B2U, ") { michael@0: next michael@0: unless michael@0: /([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/; michael@0: ( $big5, $iso1993, $iso2000, $iso2001 ) = ( $1, $2, $3, $4 ); michael@0: michael@0: my $b = hex($big5); michael@0: michael@0: # For non-HKSCS mode, only take data in the VDA range (?) michael@0: next unless $hkscs_mode michael@0: michael@0: # Note that we don't go from B5+C6A1-B5+C6FE, but rather only michael@0: # C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals) michael@0: # because C8D4-C8FE are not assigned in TW-BIG5 michael@0: # if we are to follow Arphic PL Big-5 fonts. (To be discussed) michael@0: or michael@0: ( $b >= 0xC6A1 && $b <= 0xC8D3 and !( $b >= 0xC6BF && $b <= 0xC6D7 ) ) michael@0: or ( $b >= 0xF9D6 && $b <= 0xF9FE ); michael@0: michael@0: print STDERR michael@0: "B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n" michael@0: if $debug michael@0: and defined( $b2u{$big5} ) michael@0: and $b2u{$big5} ne $iso2000; michael@0: michael@0: $b2u{$big5} = $bmp_only ? $iso2000 : $iso2001 michael@0: unless !$hkscs_mode michael@0: and $b == 0xF9FE; michael@0: michael@0: # B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to michael@0: # U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square) respectively. michael@0: # Which is more correct? I don't know! (To be discussed) michael@0: michael@0: print STDERR michael@0: "1993: U+$iso1993 redefined from $u2b{$iso1993} to $big5.\n" michael@0: if $debug michael@0: and defined( $u2b{$iso1993} ) michael@0: and $u2b{$iso1993} ne $big5; michael@0: michael@0: $u2b{$iso1993} = $big5; michael@0: michael@0: print STDERR michael@0: "2000: U+$iso2000 redefined from $u2b{$iso2000} to $big5.\n" michael@0: if $debug michael@0: and defined( $u2b{$iso2000} ) michael@0: and $u2b{$iso2000} ne $big5; michael@0: michael@0: $u2b{$iso2000} = $big5; michael@0: michael@0: print STDERR michael@0: "2001: U+$iso2001 redefined from $u2b{$iso2001} to $big5.\n" michael@0: if $debug michael@0: and defined( $u2b{$iso2001} ) michael@0: and $u2b{$iso2001} ne $big5; michael@0: michael@0: $u2b{$iso2001} = $big5; michael@0: } michael@0: close B2U; michael@0: michael@0: } # read_hkscs_main() michael@0: michael@0: michael@0: sub read_hkscs_cmp() { michael@0: michael@0: ########################################################################### michael@0: # Add Big5 compatibility coding... michael@0: # michael@0: # Stephan, here is the code segment that you may want to implement michael@0: # in your convertbig5hkscs2001.pl michael@0: # michael@0: open( B5CMP, ") { michael@0: if (/^=====/) { $mode = 1; next; } michael@0: next if $mode == 0; michael@0: last if $mode == 1 and /^\s+/; michael@0: chomp; michael@0: my ( $big5cmp, $big5 ) = split " "; michael@0: michael@0: $big5cmp = uc($big5cmp); michael@0: $big5 = uc($big5); michael@0: my $uni = $b2u{$big5}; michael@0: my $unicmp = $b2u{$big5cmp}; michael@0: michael@0: print STDERR michael@0: "Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t" michael@0: if $debug; michael@0: $b2u{$big5cmp} = $uni; michael@0: $u2b{$unicmp} = $big5; michael@0: print STDERR michael@0: "Now: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n" michael@0: if $debug; michael@0: } michael@0: close B5CMP; michael@0: } # read_hkscs_cmp(); michael@0: michael@0: michael@0: sub post_tuning() { michael@0: michael@0: # And finally, fine-tuning... michael@0: for $i ( 0x00 .. 0x80 ) { michael@0: $big5 = $unicode = sprintf( "%04X", $i ); michael@0: $b2u{$big5} = $unicode; michael@0: } michael@0: michael@0: # Add Euro '£á' (I wonder why this 950.txt doesn't have it.) michael@0: $b2u{"A3E1"} = "20AC"; michael@0: $u2b{"20AC"} = "A3E1"; michael@0: michael@0: # Box drawing characters: michael@0: # Align with Big-5E (To be discussed, as it differs from CP950 and HKSCS) michael@0: # (To be discussed) michael@0: if ( !$hkscs_mode ) { michael@0: $u2b{"2550"} = "A2A4"; # Big5: ¢¤ (also B5-F9F9) michael@0: $u2b{"255E"} = "A2A5"; # Big5: ¢¥ (also B5-F9E9) michael@0: $u2b{"2561"} = "A2A7"; # Big5: ¢§ (also B5-F9EB) michael@0: $u2b{"256A"} = "A2A6"; # Big5: ¢¦ (also B5-F9EA) michael@0: $u2b{"256D"} = "A27E"; # Big5: ¢~ (also B5-F9FA) michael@0: $u2b{"256E"} = "A2A1"; # Big5: ¢¡ (also B5-F9FB) michael@0: $u2b{"256F"} = "A2A3"; # Big5: ¢£ (also B5-F9FD) michael@0: $u2b{"2570"} = "A2A2"; # Big5: ¢¢ (also B5-F9FC) michael@0: } michael@0: michael@0: # "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30 (¢Ì¢Í¢Î) michael@0: # (To be discussed) michael@0: if ( !$hkscs_mode ) { michael@0: $b2u{"A2CC"} = "3038"; michael@0: $u2b{"3038"} = "A2CC"; michael@0: $b2u{"A2CD"} = "3039"; michael@0: $u2b{"3039"} = "A2CD"; michael@0: $b2u{"A2CE"} = "303A"; michael@0: $u2b{"303A"} = "A2CE"; michael@0: } michael@0: michael@0: # The character for ethnic group "Yi" (ÂU): michael@0: # (To be discussed) michael@0: $u2b{"5F5E"} = "C255"; # Always add this. michael@0: if ( !$hkscs_mode ) { michael@0: $b2u{"C255"} = "5F5E"; michael@0: } michael@0: michael@0: } # post_tuning() michael@0: michael@0: michael@0: sub gen_charmapml() { michael@0: michael@0: ########################################################################### michael@0: # michael@0: # Codes for generating CharMapML XML file michael@0: michael@0: print < michael@0: michael@0: EOT michael@0: michael@0: if ($hkscs_mode) { michael@0: print < michael@0: michael@0: michael@0: Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001) michael@0: with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed, michael@0: and with some other manual tweaking. michael@0: michael@0: michael@0: EOT michael@0: } michael@0: else { michael@0: print < michael@0: michael@0: michael@0: Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001) michael@0: with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed, michael@0: and with some other manual tweaking. michael@0: michael@0: michael@0: EOT michael@0: } michael@0: michael@0: print < michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: EOT michael@0: print " \n"; michael@0: for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) { michael@0: $big5 = $u2b{$unicode}; michael@0: $u = hex($unicode); michael@0: next michael@0: unless defined( $b2u{$big5} ) michael@0: and $unicode eq $b2u{$big5} michael@0: and michael@0: not( $use_range and !$hkscs_mode and $u >= 0xE000 && $u <= 0xF6B0 ); michael@0: printf " \n", hex($big5); michael@0: } michael@0: else { michael@0: printf "b=\"%s %s\"/>\n", substr( $big5, 0, 2 ), michael@0: substr( $big5, 2, 2 ); michael@0: } michael@0: } michael@0: michael@0: print " \n"; michael@0: for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) { michael@0: $big5 = $u2b{$unicode}; michael@0: next if defined( $b2u{$big5} ) and hex($unicode) == hex( $b2u{$big5} ); michael@0: if ( $unicode eq "F900" ) { michael@0: print " \n"; michael@0: } michael@0: printf " \n", hex($unicode), michael@0: substr( $big5, 0, 2 ), substr( $big5, 2, 2 ); michael@0: } michael@0: michael@0: my %fbu; michael@0: print " \n"; michael@0: for $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) { michael@0: $unicode = $b2u{$big5}; michael@0: if ( !defined( $u2b{$unicode} ) or hex($big5) != hex( $u2b{$unicode} ) ) michael@0: { michael@0: $fbu{$unicode} = $big5; michael@0: } michael@0: } michael@0: for $unicode ( sort { hex($a) <=> hex($b) } keys %fbu ) { michael@0: $big5 = $fbu{$unicode}; michael@0: printf " \n", hex($unicode), michael@0: substr( $big5, 0, 2 ), substr( $big5, 2, 2 ); michael@0: } michael@0: michael@0: if ( $use_range and !$hkscs_mode ) { michael@0: print < tag for TW-BIG5. michael@0: Big-5E and Big5-HKSCS have assigned characters in these areas, michael@0: and we will have to use the and tags instead. michael@0: --> michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: EOT michael@0: } michael@0: michael@0: print < michael@0: michael@0: EOT michael@0: michael@0: } # gen_charmapml() michael@0: michael@0: sub gen_check_b2u() { michael@0: michael@0: ########################################################################### michael@0: # michael@0: # Codes for generating a raw table for verification and testing michael@0: # michael@0: # #print $u2b{"F7D1"}, "\n"; michael@0: # print $b2u{$u2b{"F7D1"}}, "\n"; michael@0: # print "FA59 -> U+", $b2u{"FA59"}, "\n"; michael@0: michael@0: foreach $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) { michael@0: $unicode = $b2u{$big5}; michael@0: $big5 =~ s/^00//; michael@0: print "U+", $unicode, ": ", $big5, "\n"; michael@0: } michael@0: } michael@0: michael@0: sub gen_check_u2b() { michael@0: foreach $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) { michael@0: $big5 = $u2b{$unicode}; michael@0: $big5 =~ s/^00//; michael@0: print "U+", $unicode, ": ", $big5, "\n"; michael@0: } michael@0: michael@0: } michael@0: michael@0: ########################################################################### michael@0: # michael@0: # Codes for generating hkscs.ut and hkscs.uf files for Mozilla michael@0: # michael@0: sub gen_mozilla_uf() { michael@0: # hkscs.uf michael@0: foreach $unicode ( sort keys %u2b ) { michael@0: $big5 = $u2b{$unicode}; michael@0: my $b = hex($big5); michael@0: print "0x", uc($big5), "\t0x", uc($unicode), "\n" michael@0: unless ( $b >= 0xA140 and $b <= 0xC6A0 ) michael@0: or ( $b >= 0xC940 and $b <= 0xF9D5 ) michael@0: or ( $b < 0x8140 ) michael@0: or ( hex($unicode) > 0xFFFF ); michael@0: } michael@0: } michael@0: michael@0: sub gen_mozilla_ut() { michael@0: # hkscs.ut michael@0: foreach $big5 ( sort keys %b2u ) { michael@0: my $b = hex($big5); michael@0: print "0x", uc($big5), "\t0x", uc( $b2u{$big5} ), "\n" michael@0: unless ( $b >= 0xA140 and $b <= 0xC6A0 ) michael@0: or ( $b < 0x8140 ) michael@0: or ( $b >= 0xC940 and $b <= 0xF9D5 ); michael@0: } michael@0: } michael@0: michael@0: michael@0: ########################################################################### michael@0: michael@0: sub gen_glibc() { michael@0: michael@0: ########################################################################## michael@0: # michael@0: # Generate index for UCS4 to Big5-HKSCS conversion table michael@0: # michael@0: @index_array = (); michael@0: michael@0: $mode = 0; michael@0: $count = 0; michael@0: for ( $uni = 0x81 ; $uni <= 0x2FFFF ; $uni++ ) { michael@0: $unicode = sprintf( "%04X", $uni ); michael@0: michael@0: # print " /* U+$unicode */\t" if $low % 4 == 0; michael@0: if ( defined( $u2b{$unicode} ) ) { michael@0: if ( $mode == 0 ) { michael@0: $range_start = $range_end = $uni; michael@0: michael@0: # printf " { %7s, ", sprintf("0x%04X", $range_start); michael@0: $mode = 1; michael@0: } michael@0: else { michael@0: $range_end = $uni; michael@0: } michael@0: } michael@0: elsif ( $mode == 1 and ( $uni - $range_end ) >= 0x80 ) { michael@0: michael@0: # Start a new range if the gap is 0x80 or larger michael@0: # printf "%7s, %5d },\n", sprintf("0x%04X", $range_end), $count; michael@0: push @index_array, [ ( $range_start, $range_end, $count ) ]; michael@0: $count += $range_end - $range_start + 1; michael@0: $mode = 0; michael@0: } michael@0: } michael@0: michael@0: # michael@0: # Note that $count and $range_end are used again as global variables michael@0: # below michael@0: # michael@0: michael@0: ########################################################################### michael@0: # michael@0: # Start generating real C code... michael@0: # michael@0: michael@0: print <<'EOT'; michael@0: /* Mapping tables for Big5-HKSCS handling. michael@0: Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc. michael@0: This file is part of the GNU C Library. michael@0: Contributed by Ulrich Drepper , 1997. michael@0: Modified for Big5-HKSCS by Roger So , 2000. michael@0: Updated for HKSCS-2001 by James Su michael@0: and Anthony Fok , 2002 michael@0: michael@0: The GNU C Library is free software; you can redistribute it and/or michael@0: modify it under the terms of the GNU Lesser General Public michael@0: License as published by the Free Software Foundation; either michael@0: version 2.1 of the License, or (at your option) any later version. michael@0: michael@0: The GNU C Library is distributed in the hope that it will be useful, michael@0: but WITHOUT ANY WARRANTY; without even the implied warranty of michael@0: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU michael@0: Lesser General Public License for more details. michael@0: michael@0: You should have received a copy of the GNU Lesser General Public michael@0: License along with the GNU C Library; if not, write to the Free michael@0: Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA michael@0: 02111-1307 USA. */ michael@0: michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: michael@0: michael@0: /* Table for Big5-HKSCS to UCS conversion. michael@0: michael@0: Original comments by Roger So when he updated the tables for HKSCS-1999: michael@0: michael@0: With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info: michael@0: http://www.digital21.gov.hk/eng/hkscs/index.html michael@0: - spacehunt 07/01/2000 michael@0: michael@0: The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt michael@0: and big5cmp.txt using a Perl script while merging C source code from michael@0: other developers. A copy of the source Perl script is available at: michael@0: michael@0: http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl michael@0: http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl michael@0: michael@0: Revisions: michael@0: 2001-10-30 made codec for Qt michael@0: 2002-03-21 ported to glibc-2.2.5 and added HKSCS-2001 michael@0: michael@0: Todo: michael@0: Use a hash for characters beyond BMP to save space and make it michael@0: more efficient michael@0: michael@0: - Anthony Fok 21 Mar 2002 michael@0: On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China michael@0: */ michael@0: michael@0: EOT michael@0: michael@0: ########################################################################## michael@0: # michael@0: # Generate Big5-HKSCS to Unicode conversion table michael@0: # michael@0: michael@0: ## print "Big5HKSCS to Unicode\n"; michael@0: michael@0: # for $high (0x81..0x8d, 0x8e..0xa0, 0xc6..0xc8, 0xf9, 0xfa..0xfe) { michael@0: michael@0: $high_start = 0x88; michael@0: $high_end = 0xfe; michael@0: michael@0: print "static const uint16_t big5_hkscs_to_ucs["; michael@0: print( ( $high_end - $high_start + 1 ) * 157 ); michael@0: print "] =\n{\n"; michael@0: for $high ( 0x88 .. 0xfe ) { michael@0: for $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) { michael@0: if ( $low == 0x40 ) { michael@0: print "\n" unless $high == $high_start; michael@0: printf michael@0: "\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n", michael@0: $high, $high, $high, $high; michael@0: } michael@0: elsif ( $low == 0xa1 ) { michael@0: print "\t\t"; michael@0: } michael@0: $big5 = sprintf( "%02X%02X", $high, $low ); michael@0: print "\t" if $low % 8 == 0; michael@0: if ( defined( $b2u{$big5} ) ) { michael@0: $unicode = $b2u{$big5}; michael@0: print "0x", $unicode, ","; michael@0: } michael@0: else { michael@0: print "0x0000,"; # for glibc michael@0: } michael@0: print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe ) michael@0: ? "\n" michael@0: : "\t" ); michael@0: } michael@0: } michael@0: print "};\n\n"; michael@0: michael@0: ########################################################################## michael@0: # michael@0: # Generate Unicode to Big5-HKSCS conversion table michael@0: # michael@0: print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n"; michael@0: foreach $index (@index_array) { michael@0: ( $start, $end ) = ( @$index[0], @$index[1] ); michael@0: printf( " /* U+%04X */\t", $start ) if ( $start % 4 != 0 ); michael@0: print "\t" x ( ( $start % 4 ) * 1.5 ) . " " x ( $start % 2 ); michael@0: for ( $i = $start ; $i <= $end ; $i++ ) { michael@0: printf( " /* U+%04X */\t", $i ) if ( $i % 4 == 0 ); michael@0: $unicode = sprintf( "%04X", $i ); michael@0: if ( defined( $big5 = $u2b{$unicode} ) ) { michael@0: if ( $big5 =~ /^00/ ) { michael@0: print '"\x', substr( $big5, 2, 2 ), '\x00",'; michael@0: } michael@0: else { michael@0: print '"\x', substr( $big5, 0, 2 ), '\x', michael@0: substr( $big5, 2, 2 ), '",'; michael@0: } michael@0: } michael@0: else { michael@0: print '"\x00\x00",'; michael@0: } michael@0: print( ( $i % 4 == 3 ) ? "\n" : " " ) unless $i == $end; michael@0: } michael@0: print $end == $range_end ? "\n" : "\n\n"; michael@0: } michael@0: print "};\n\n"; michael@0: michael@0: ########################################################################### michael@0: michael@0: print <= 0x81 && ch <= 0xfe) \ michael@0: { \ michael@0: /* Two-byte character. First test whether the next character \ michael@0: is also available. */ \ michael@0: uint32_t ch2; \ michael@0: int idx; \ michael@0: \ michael@0: if (__builtin_expect (inptr + 1 >= inend, 0)) \ michael@0: { \ michael@0: /* The second character is not available. */ \ michael@0: result = __GCONV_INCOMPLETE_INPUT; \ michael@0: break; \ michael@0: } \ michael@0: \ michael@0: ch2 = inptr[1]; \ michael@0: /* See whether the second byte is in the correct range. */ \ michael@0: if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe)) \ michael@0: { \ michael@0: if (ch >= 0x88) \ michael@0: { \ michael@0: /* Look up the table */ \ michael@0: idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62); \ michael@0: if ((ch = big5_hkscs_to_ucs[idx]) == 0) \ michael@0: { \ michael@0: /* This is illegal. */ \ michael@0: if (! ignore_errors_p ()) \ michael@0: { \ michael@0: result = __GCONV_ILLEGAL_INPUT; \ michael@0: break; \ michael@0: } \ michael@0: \ michael@0: ++inptr; \ michael@0: ++*irreversible; \ michael@0: continue; \ michael@0: } \ michael@0: } \ michael@0: else \ michael@0: { \ michael@0: /* 0x81..0x87 in UDA3, currently maps linearly to PUA */ \ michael@0: ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62) \ michael@0: + 0xeeb8; \ michael@0: } \ michael@0: } \ michael@0: else \ michael@0: { \ michael@0: /* This is illegal. */ \ michael@0: if (! ignore_errors_p ()) \ michael@0: { \ michael@0: result = __GCONV_ILLEGAL_INPUT; \ michael@0: break; \ michael@0: } \ michael@0: \ michael@0: ++inptr; \ michael@0: ++*irreversible; \ michael@0: continue; \ michael@0: } \ michael@0: \ michael@0: inptr += 2; \ michael@0: } \ michael@0: else if (__builtin_expect (ch, 0) == 0xff) \ michael@0: { \ michael@0: result = __GCONV_ILLEGAL_INPUT; \ michael@0: break; \ michael@0: } \ michael@0: else /* 0x00 to 0x80 */ \ michael@0: ++inptr; \ michael@0: \ michael@0: put32 (outptr, ch); \ michael@0: outptr += 4; \ michael@0: } michael@0: #define LOOP_NEED_FLAGS michael@0: #include michael@0: michael@0: michael@0: /* Next, define the other direction. */ michael@0: #define MIN_NEEDED_INPUT MIN_NEEDED_TO michael@0: #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM michael@0: #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM michael@0: #define LOOPFCT TO_LOOP michael@0: #define BODY \ michael@0: { \ michael@0: uint32_t ch = get32 (inptr); \ michael@0: const unsigned char *cp = ""; \ michael@0: unsigned char b5ch[2] = "\0\0"; \ michael@0: int i; \ michael@0: \ michael@0: for (i = 0; \ michael@0: i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0])); \ michael@0: ++i) \ michael@0: { \ michael@0: if (ch < from_ucs4_idx[i].from) \ michael@0: break; \ michael@0: if (from_ucs4_idx[i].to >= ch) \ michael@0: { \ michael@0: cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset \ michael@0: + ch - from_ucs4_idx[i].from]; \ michael@0: break; \ michael@0: } \ michael@0: } \ michael@0: \ michael@0: if (ch <= 0x80) \ michael@0: { \ michael@0: b5ch[0] = ch; \ michael@0: cp = b5ch; \ michael@0: } \ michael@0: \ michael@0: if (cp[0] == '\0' && ch != 0) \ michael@0: { \ michael@0: UNICODE_TAG_HANDLER (ch, 4); \ michael@0: \ michael@0: /* Illegal character. */ \ michael@0: STANDARD_ERR_HANDLER (4); \ michael@0: } \ michael@0: else \ michael@0: { \ michael@0: /* See whether there is enough room for the second byte we write. */ \ michael@0: if (__builtin_expect (cp[1], '\1') != '\0' \ michael@0: && __builtin_expect (outptr + 1 >= outend, 0)) \ michael@0: { \ michael@0: /* We have not enough room. */ \ michael@0: result = __GCONV_FULL_OUTPUT; \ michael@0: break; \ michael@0: } \ michael@0: \ michael@0: *outptr++ = cp[0]; \ michael@0: if (cp[1] != '\0') \ michael@0: *outptr++ = cp[1]; \ michael@0: } \ michael@0: \ michael@0: inptr += 4; \ michael@0: } michael@0: #define LOOP_NEED_FLAGS michael@0: #include michael@0: michael@0: michael@0: /* Now define the toplevel functions. */ michael@0: #include michael@0: EOT michael@0: michael@0: }