michael@0: #!/usr/bin/perl -w
michael@0: #
michael@0: #  gen-big5hkscs-2001-mozilla.pl
michael@0: #      a Perl script that generates Big5-HKSCS <-> Unicode
michael@0: #      conversion tables for Mozilla
michael@0: #
michael@0: #  Author (of the original Perl script):
michael@0: #      Anthony Fok <anthony@thizlinux.com> <foka@debian.org>
michael@0: #  Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd.
michael@0: #  License: GNU General Public License, v2 or later.
michael@0: #
michael@0: #  This version includes original C source code from
michael@0: #  glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper <drepper@redhat.com>
michael@0: #  Roger So <roger.so@sw-linux.com>
michael@0: #
michael@0: #                         First attempt for Qt-2.3.x: 2001-09-21
michael@0: #                     A working version for Qt-2.3.x: 2001-10-30
michael@0: #              Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21
michael@0: #  Adapted to generate conversion tables for Mozilla: 2002-11-26
michael@0: #  Adapted to generate conversion tables for Mozilla: 2002-11-30
michael@0: #                     Cleaned up the script somewhat: 2002-12-04
michael@0: # Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10
michael@0: #
michael@0: #  Notes:
michael@0: #
michael@0: #   1. The latest version of this script may be found in:
michael@0: #          http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
michael@0: #          http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
michael@0: #      Or, better yet, e-mail me and ask for the latest version.
michael@0: #
michael@0: #   2. This script generates data from 3 tables:
michael@0: #       a. http://www.microsoft.com/typography/unicode/950.txt
michael@0: #       b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt
michael@0: #       c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt
michael@0: #
michael@0: #      Make sure your big5-iso.txt is the latest HKSCS-2001 version.
michael@0: #
michael@0: #   3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into
michael@0: #       different areas similar to the way Ulrich and Roger did it,
michael@0: #       but extended for HKSCS-2001.
michael@0: #
michael@0: #   4. [Mozilla]: This script is very quick-and-dirty in some places.
michael@0: #       Call either gen_mozilla_uf() or gen_mozilla_ut() to generate
michael@0: #       the appropriate tables for feeding into "fromu" or "tou".
michael@0: #
michael@0: #   5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized.
michael@0: #       Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode.
michael@0: #       Otherwise, this script would generate a HKSCS table.
michael@0: #       (Yes, I know, I should clean up this script and make it more modular,
michael@0: #       and with command-line options or whatnot.  I'll do that later.  :-)
michael@0: #
michael@0: #  If you have any questions or concerns, please feel free to contact me
michael@0: #  at Anthony Fok <anthony@thizlinux.com> or <foka@debian.org>  :-)
michael@0: #
michael@0: #  Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK)
michael@0: #  for their generous support in this work.
michael@0: #
michael@0: 
michael@0: # 1. UDA3, 0x8840 - 0x8dfe
michael@0: # 2. UDA2, 0x8e40 - 0xa0fe
michael@0: # 3. VDA,  0xc6a1 - 0xc8fe
michael@0: 
michael@0: #use Getopt::Std;
michael@0: 
michael@0: # Conversion tables shared by every subroutine below.  Keys and values are
michael@0: # hexadecimal strings without a "0x" prefix.
michael@0: my %b2u;    # Big5 code -> Unicode code point
michael@0: my %u2b;    # Unicode code point -> Big5 code
michael@0: 
michael@0: # Scratch variables shared by the subroutines below.
michael@0: my ( $unicode, $big5 );
michael@0: my ( $high,    $low );
michael@0: my ( $i,       $count );
michael@0: 
michael@0: #
michael@0: # Settings
michael@0: #
michael@0: 
michael@0: # Non-zero: print "redefined"/"Was"/"Now" diagnostics to STDERR while reading.
michael@0: my $debug = 0;
michael@0: 
michael@0: # Non-zero: build the Big5-HKSCS table; zero: build the TW-BIG5 table.
michael@0: my $hkscs_mode = 1;
michael@0: 
michael@0: # Non-zero: also map B5+C6BF..C6D7 to the Kangxi Radicals (Big5 -> Unicode).
michael@0: my $kangxi = 0;
michael@0: 
michael@0: # Non-zero (TW-BIG5 only): gen_charmapml() emits <range> elements for
michael@0: # U+E000..U+F6B0 instead of individual assignments.
michael@0: my $use_range = 0;
michael@0: 
michael@0: # Non-zero: read_hkscs_main() takes the Big5 -> Unicode value from the third
michael@0: # column of big5-iso.txt rather than the fourth.
michael@0: my $bmp_only = 1;
michael@0: 
michael@0: #
michael@0: # Subroutine Declaration
michael@0: #
michael@0: 
michael@0: # Readers and table fix-ups
michael@0: sub read_cp950();
michael@0: sub adjust_radicals();
michael@0: sub read_hkscs_main();
michael@0: sub read_hkscs_cmp();
michael@0: sub post_tuning();
michael@0: 
michael@0: # Generators
michael@0: sub gen_charmapml();
michael@0: sub gen_check_b2u();
michael@0: sub gen_check_u2b();
michael@0: sub gen_mozilla_uf();
michael@0: sub gen_mozilla_ut();
michael@0: sub gen_glibc();
michael@0: 
michael@0: ###########################################################################
michael@0: #
michael@0: # Main program
michael@0: #
michael@0: 
michael@0: # Step 1: Microsoft's CP950 is the base Big5 table.
michael@0: read_cp950();
michael@0: 
michael@0: # Step 2: add the Kangxi Radicals mappings.
michael@0: # The b2u direction is added only if $kangxi is true.
michael@0: adjust_radicals();
michael@0: 
michael@0: # Step 3: merge the HKSCS tables.
michael@0: # What is taken from them depends on $hkscs_mode.
michael@0: read_hkscs_main();
michael@0: if ($hkscs_mode) {
michael@0:     read_hkscs_cmp();
michael@0: }
michael@0: 
michael@0: # Step 4: final manual adjustments.
michael@0: post_tuning();
michael@0: 
michael@0: 
michael@0: # Step 5: run exactly one of the generators:
michael@0: #gen_charmapml();
michael@0: gen_mozilla_uf();
michael@0: #gen_mozilla_ut();
michael@0: #gen_check_u2b();
michael@0: #gen_glibc();
michael@0: 
michael@0: 
michael@0: # End of program
michael@0: exit 0;
michael@0: 
michael@0: 
michael@0: #############################################################################
michael@0: #
michael@0: #  Subroutines
michael@0: #
michael@0: 
michael@0: sub read_cp950() {
michael@0: 
michael@0:     # Read Microsoft's code page description "950.txt" from the current
michael@0:     # directory and seed the two global tables from it:
michael@0:     #
michael@0:     #   * every "DBCSTABLE <n> ;LeadByte = 0x<hh>" section fills %b2u
michael@0:     #     (Big5 -> Unicode), the Big5 key being lead byte . trail byte;
michael@0:     #   * every "WCTABLE <n>" section fills %u2b (Unicode -> Big5), except
michael@0:     #     for the mappings rejected below.
michael@0:     #
michael@0:     # Keys and values are stored as upper-case hex without the "0x" prefix.
michael@0:     # Reading stops at the "ENDCODEPAGE" line.  Takes no arguments; dies if
michael@0:     # the file cannot be opened.
michael@0: 
michael@0:     my $file = "950.txt";
michael@0:     open( my $cp950, "<", $file ) or die "Cannot open $file: $!";
michael@0: 
michael@0:     # Individual Unicode code points whose Unicode -> Big5 entry in CP950
michael@0:     # is dropped (Microsoft's over-generous mappings).
michael@0:     my %skip_u2b = map { $_ => 1 } (
michael@0: 
michael@0:         # ￠￡￥’μ﹐
michael@0:         0xA2, 0xA3, 0xA5, 0xB4, 0xB5, 0xB8,
michael@0: 
michael@0:         # ¯─∥‧˙〃￣﹨°≡︴⊙⊕～﹋   (0x0305 and 0x203E: ???)
michael@0:         0x0305, 0x2015, 0x2016, 0x2022, 0x2024,
michael@0:         0x2033, 0x203E, 0x2216, 0x2218, 0x2263,
michael@0:         0x2307, 0x2609, 0x2641, 0x301C, 0x3030,
michael@0: 
michael@0:         # ︿‘﹑
michael@0:         0xFF3E, 0xFF40, 0xFF64,
michael@0:     );
michael@0: 
michael@0:     my $mode     = 0;     # 0: outside a table, 1: in DBCSTABLE, 2: in WCTABLE
michael@0:     my $expected = 0;     # number of entries announced by the table header
michael@0:     my $seen     = 0;     # number of entries consumed so far
michael@0:     my $lead     = "";    # lead byte (hex) of the current DBCSTABLE
michael@0: 
michael@0:     while ( my $line = <$cp950> ) {
michael@0:         $line =~ s/\r//;
michael@0:         chomp $line;
michael@0:         next if $line =~ /^$/;
michael@0:         last if $line =~ /^ENDCODEPAGE/;
michael@0: 
michael@0:         if ( $line =~ /^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/ ) {
michael@0:             ( $mode, $expected, $lead, $seen ) = ( 1, $1, $2, 0 );
michael@0:             next;
michael@0:         }
michael@0:         if ( $line =~ /^WCTABLE (\d+)/ ) {
michael@0:             ( $mode, $expected, $seen ) = ( 2, $1, 0 );
michael@0:             next;
michael@0:         }
michael@0:         next if $mode == 0;
michael@0: 
michael@0:         if ( $mode == 1 ) {
michael@0: 
michael@0:             # "<trail byte> TAB <unicode> [TAB <comment>]"
michael@0:             my ( $trail, $uni ) = split "\t", $line;
michael@0:             $trail =~ s/^0x//;
michael@0:             $uni   =~ s/^0x//;
michael@0:             $b2u{ uc( $lead . $trail ) } = uc($uni);
michael@0:         }
michael@0:         else {
michael@0: 
michael@0:             # "<unicode> TAB <big5> [TAB <comment>]"
michael@0:             my ( $uni, $code ) = split "\t", $line;
michael@0:             $uni  =~ s/^0x//;
michael@0:             $code =~ s/^0x//;
michael@0:             my $u = hex($uni);
michael@0:             my $b = hex($code);
michael@0: 
michael@0:             my $faked =
michael@0: 
michael@0:               # "Faked" accented latin characters
michael@0:               ( $b <= 0xFF && $b != $u )
michael@0: 
michael@0:               # "Faked" Ideographic Annotation ___ Mark
michael@0:               || ( $u >= 0x3192 && $u <= 0x319F )
michael@0: 
michael@0:               # "Faked" Parenthesized Ideograph ___
michael@0:               || ( $u >= 0x3220 && $u <= 0x3243 )
michael@0: 
michael@0:               # "Faked" Circled Ideograph ___ except Circled Ideograph Correct
michael@0:               || ( $u >= 0x3280 && $u <= 0x32B0 && $u != 0x32A3 )
michael@0: 
michael@0:               # Individually listed code points
michael@0:               || $skip_u2b{$u};
michael@0: 
michael@0:             $u2b{ uc($uni) } = uc($code) unless $faked;
michael@0:         }
michael@0: 
michael@0:         # Leave the table once the announced number of entries was read.
michael@0:         if ( ++$seen == $expected ) {
michael@0:             $mode     = 0;
michael@0:             $expected = 0;
michael@0:         }
michael@0:     }
michael@0:     close($cp950);
michael@0: }
michael@0: 
michael@0: sub adjust_radicals() {
michael@0: 
michael@0:     # B5+C6BF - B5+C6D7: Radicals (?)
michael@0:     #
michael@0:     # Every Kangxi Radical code point listed below is always mapped to its
michael@0:     # Big5 code in %u2b; the opposite direction (%b2u) is installed only
michael@0:     # when $kangxi is true.
michael@0:     #
michael@0:     # Background:
michael@0:     #   TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible.
michael@0:     #
michael@0:     #   Big5-HKSCS tends towards using the character in Unicode CJK
michael@0:     #   Ideographs.  Note that HKSCS does not explicitly define
michael@0:     #       B5+C6CF, B5+C6D3, B5+C6D5, B5+C6D7 (廴、无、癶、隶),
michael@0:     #   but does have these characters at B5+FBFD, B5+FCD3, B5+FEC1, B5+90C4,
michael@0:     #   mapped to U+5EF4, U+65E0, U+7676, U+96B6 respectively.
michael@0:     #
michael@0:     #   As for B5+C6CD (⼳), HKSCS maps it to U+2F33 just like TW-BIG5.
michael@0:     #   However, it also maps B5+FBF4 (幺) to U+5E7A.
michael@0: 
michael@0:     # [ Big5 code, Kangxi Radical code point ]
michael@0:     my @radicals = (
michael@0:         [ "C6BF", "2F02" ],    # 丶
michael@0:         [ "C6C0", "2F03" ],    # 丿
michael@0:         [ "C6C1", "2F05" ],    # 亅
michael@0:         [ "C6C2", "2F07" ],    # 亠
michael@0:         [ "C6C3", "2F0C" ],    # 冂
michael@0:         [ "C6C4", "2F0D" ],    # 冖
michael@0:         [ "C6C5", "2F0E" ],    # 冫
michael@0:         [ "C6C6", "2F13" ],    # 勹
michael@0:         [ "C6C7", "2F16" ],    # 匸
michael@0:         [ "C6C8", "2F19" ],    # 卩
michael@0:         [ "C6C9", "2F1B" ],    # 厶
michael@0:         [ "C6CA", "2F22" ],    # 夊
michael@0:         [ "C6CB", "2F27" ],    # 宀
michael@0:         [ "C6CC", "2F2E" ],    # 巛
michael@0:         [ "C6CD", "2F33" ],    # ⼳
michael@0:         [ "C6CE", "2F34" ],    # 广
michael@0:         [ "C6CF", "2F35" ],    # 廴
michael@0:         [ "C6D0", "2F39" ],    # 彐
michael@0:         [ "C6D1", "2F3A" ],    # 彡
michael@0:         [ "C6D2", "2F41" ],    # 攴
michael@0:         [ "C6D3", "2F46" ],    # 无
michael@0:         [ "C6D4", "2F67" ],    # 疒
michael@0:         [ "C6D5", "2F68" ],    # 癶
michael@0:         [ "C6D6", "2FA1" ],    # 辵
michael@0:         [ "C6D7", "2FAA" ],    # 隶
michael@0:     );
michael@0: 
michael@0:     foreach my $pair (@radicals) {
michael@0:         my ( $code, $radical ) = @$pair;
michael@0:         $u2b{$radical} = $code;
michael@0:         $b2u{$code}    = $radical if $kangxi;
michael@0:     }
michael@0: }
michael@0: 
michael@0: sub read_hkscs_main() {
michael@0: 
michael@0:     # Merge the HKSCS table "big5-iso.txt" (read from the current directory)
michael@0:     # into %b2u and %u2b.
michael@0:     #
michael@0:     # Only lines containing four whitespace-separated hex fields are used:
michael@0:     #   Big5 code, then three Unicode columns (called 1993, 2000 and 2001 in
michael@0:     #   the diagnostics; the last one may have five digits).
michael@0:     #
michael@0:     #   * %b2u{Big5} becomes the "2000" column when $bmp_only is set, the
michael@0:     #     "2001" column otherwise;
michael@0:     #   * %u2b gets an entry for each of the three Unicode columns.
michael@0:     #
michael@0:     # All fields are upper-cased so that the keys agree with the ones stored
michael@0:     # by read_cp950() and read_hkscs_cmp() even if the input uses lower-case
michael@0:     # hex digits.  With $debug set, every overwritten mapping is reported on
michael@0:     # STDERR.  Takes no arguments; dies if the file cannot be opened.
michael@0: 
michael@0:     my $file = "big5-iso.txt";
michael@0:     open( my $in, "<", $file ) or die "Cannot open $file: $!";
michael@0: 
michael@0:     while ( my $line = <$in> ) {
michael@0:         next
michael@0:           unless $line =~
michael@0: /([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/;
michael@0:         my ( $code, $iso1993, $iso2000, $iso2001 ) =
michael@0:           map { uc } ( $1, $2, $3, $4 );
michael@0: 
michael@0:         my $b = hex($code);
michael@0: 
michael@0:         # For non-HKSCS mode, only take data in the VDA range (?)
michael@0:         unless ($hkscs_mode) {
michael@0: 
michael@0:             # Note that we don't go from B5+C6A1-B5+C8FE, but rather only
michael@0:             # C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals)
michael@0:             # because C8D4-C8FE are not assigned in TW-BIG5
michael@0:             # if we are to follow Arphic PL Big-5 fonts.  (To be discussed)
michael@0:             my $in_vda = ( $b >= 0xC6A1 && $b <= 0xC8D3 )
michael@0:               && !( $b >= 0xC6BF && $b <= 0xC6D7 );
michael@0:             my $in_f9_block = ( $b >= 0xF9D6 && $b <= 0xF9FE );
michael@0:             next unless $in_vda or $in_f9_block;
michael@0:         }
michael@0: 
michael@0:         print STDERR
michael@0:           "B2U, 2000: $code redefined from U+$b2u{$code} to U+$iso2000.\n"
michael@0:           if $debug
michael@0:           and defined( $b2u{$code} )
michael@0:           and $b2u{$code} ne $iso2000;
michael@0: 
michael@0:         # B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to
michael@0:         # U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square) respectively.
michael@0:         # Which is more correct?  I don't know!  (To be discussed)
michael@0:         # For now the existing mapping of B5+F9FE is kept in non-HKSCS mode.
michael@0:         if ( $hkscs_mode or $b != 0xF9FE ) {
michael@0:             $b2u{$code} = $bmp_only ? $iso2000 : $iso2001;
michael@0:         }
michael@0: 
michael@0:         # Unicode -> Big5 for each of the three columns, in column order.
michael@0:         for my $column (
michael@0:             [ "1993", $iso1993 ],
michael@0:             [ "2000", $iso2000 ],
michael@0:             [ "2001", $iso2001 ]
michael@0:           )
michael@0:         {
michael@0:             my ( $label, $uni ) = @$column;
michael@0: 
michael@0:             print STDERR
michael@0:               "$label: U+$uni redefined from $u2b{$uni} to $code.\n"
michael@0:               if $debug
michael@0:               and defined( $u2b{$uni} )
michael@0:               and $u2b{$uni} ne $code;
michael@0: 
michael@0:             $u2b{$uni} = $code;
michael@0:         }
michael@0:     }
michael@0:     close($in);
michael@0: 
michael@0: }    # read_hkscs_main()
michael@0: 
michael@0: 
michael@0: sub read_hkscs_cmp() {
michael@0: 
michael@0:     ###########################################################################
michael@0:     # Add Big5 compatibility coding...
michael@0:     #
michael@0:     # Stephan, here is the code segment that you may want to implement
michael@0:     # in your convertbig5hkscs2001.pl
michael@0:     #
michael@0:     # Reads "big5cmp.txt" from the current directory.  Everything up to and
michael@0:     # including the first line starting with "=====" is ignored; after that
michael@0:     # each line holds two whitespace-separated Big5 codes, and reading stops
michael@0:     # at the first line that begins with whitespace (blank lines included).
michael@0:     #
michael@0:     # For each pair (first code, second code):
michael@0:     #   * the first code is re-pointed in %b2u to the Unicode value of the
michael@0:     #     second code;
michael@0:     #   * the Unicode value the first code had before is re-pointed in %u2b
michael@0:     #     to the second code.
michael@0:     #
michael@0:     # Lines with fewer than two fields, or naming a code that has no entry
michael@0:     # in %b2u, are skipped with a warning instead of storing undefined
michael@0:     # values / an empty-string key in the tables.
michael@0:     # Takes no arguments; dies if the file cannot be opened.
michael@0:     #
michael@0:     my $file = "big5cmp.txt";
michael@0:     open( my $in, "<", $file ) or die "Cannot open $file: $!";
michael@0: 
michael@0:     my $in_table = 0;
michael@0:     while ( my $line = <$in> ) {
michael@0:         if ( $line =~ /^=====/ ) { $in_table = 1; next; }
michael@0:         next unless $in_table;
michael@0:         last if $line =~ /^\s+/;
michael@0:         chomp $line;
michael@0: 
michael@0:         my ( $big5cmp, $std ) = split " ", $line;
michael@0:         unless ( defined $big5cmp and defined $std ) {
michael@0:             warn "$file: skipping malformed line: $line\n";
michael@0:             next;
michael@0:         }
michael@0: 
michael@0:         $big5cmp = uc($big5cmp);
michael@0:         $std     = uc($std);
michael@0:         my $uni    = $b2u{$std};
michael@0:         my $unicmp = $b2u{$big5cmp};
michael@0:         unless ( defined $uni and defined $unicmp ) {
michael@0:             warn "$file: skipping $big5cmp -> $std: no Unicode mapping known\n";
michael@0:             next;
michael@0:         }
michael@0: 
michael@0:         print STDERR
michael@0:           "Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t"
michael@0:           if $debug;
michael@0:         $b2u{$big5cmp} = $uni;
michael@0:         $u2b{$unicmp}  = $std;
michael@0:         print STDERR
michael@0:           "Now:  U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n"
michael@0:           if $debug;
michael@0:     }
michael@0:     close($in);
michael@0: }    # read_hkscs_cmp();
michael@0: 
michael@0: 
michael@0: sub post_tuning() {
michael@0: 
michael@0:     # Final manual adjustments applied on top of the tables read from the
michael@0:     # input files.  The first three steps apply to both modes; the rest
michael@0:     # only when $hkscs_mode is off (TW-BIG5).
michael@0: 
michael@0:     # Codes 0x00 - 0x80 map to the identical Unicode code point.
michael@0:     foreach my $byte ( 0x00 .. 0x80 ) {
michael@0:         my $hex = sprintf( "%04X", $byte );
michael@0:         $b2u{$hex} = $hex;
michael@0:     }
michael@0: 
michael@0:     # Add Euro '€' (I wonder why this 950.txt doesn't have it.)
michael@0:     ( $b2u{"A3E1"}, $u2b{"20AC"} ) = ( "20AC", "A3E1" );
michael@0: 
michael@0:     # The character for ethnic group "Yi" (彝):
michael@0:     # the Unicode -> Big5 direction is always added.  (To be discussed)
michael@0:     $u2b{"5F5E"} = "C255";
michael@0: 
michael@0:     return if $hkscs_mode;
michael@0: 
michael@0:     # ... and in TW-BIG5 mode the Big5 -> Unicode direction as well.
michael@0:     $b2u{"C255"} = "5F5E";
michael@0: 
michael@0:     # Box drawing characters:
michael@0:     # Align with Big-5E (To be discussed, as it differs from CP950 and HKSCS)
michael@0:     my %box_drawing = (
michael@0:         "2550" => "A2A4",    # Big5: ═	(also B5-F9F9)
michael@0:         "255E" => "A2A5",    # Big5: ╞	(also B5-F9E9)
michael@0:         "2561" => "A2A7",    # Big5: ╡	(also B5-F9EB)
michael@0:         "256A" => "A2A6",    # Big5: ╪	(also B5-F9EA)
michael@0:         "256D" => "A27E",    # Big5: ╭	(also B5-F9FA)
michael@0:         "256E" => "A2A1",    # Big5: ╮	(also B5-F9FB)
michael@0:         "256F" => "A2A3",    # Big5: ╯	(also B5-F9FD)
michael@0:         "2570" => "A2A2",    # Big5: ╰	(also B5-F9FC)
michael@0:     );
michael@0:     while ( my ( $uni, $code ) = each %box_drawing ) {
michael@0:         $u2b{$uni} = $code;
michael@0:     }
michael@0: 
michael@0:     # "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30 (十卄卅)
michael@0:     # (To be discussed)
michael@0:     foreach my $pair ( [ "A2CC", "3038" ], [ "A2CD", "3039" ],
michael@0:         [ "A2CE", "303A" ] )
michael@0:     {
michael@0:         my ( $code, $uni ) = @$pair;
michael@0:         $b2u{$code} = $uni;
michael@0:         $u2b{$uni}  = $code;
michael@0:     }
michael@0: 
michael@0: }    # post_tuning()
michael@0: 
michael@0: 
michael@0: sub gen_charmapml() {
michael@0: 
michael@0:     ###########################################################################
michael@0:     #
michael@0:     #  Codes for generating CharMapML XML file
michael@0:     #
michael@0:     #  Prints a complete <characterMapping> document to STDOUT, built from
michael@0:     #  %u2b and %b2u:
michael@0:     #    <a>    pairs that agree in both tables (round trip);
michael@0:     #    <fub>  Unicode -> Big5 entries without a matching Big5 -> Unicode;
michael@0:     #    <fbu>  Big5 -> Unicode entries without a matching Unicode -> Big5;
michael@0:     #    <range> the three user-defined areas, only when $use_range is set
michael@0:     #           and $hkscs_mode is off (those code points are then left out
michael@0:     #           of the <a> list).
michael@0:     #  The mapping id is "big5-hkscs-2001" or "tw-big5-2002" depending on
michael@0:     #  $hkscs_mode.  Takes no arguments.
michael@0: 
michael@0:     my $id = $hkscs_mode ? "big5-hkscs-2001" : "tw-big5-2002";
michael@0: 
michael@0:     print <<EOT;
michael@0: <?xml version="1.0" encoding="UTF-8" ?>
michael@0: <!DOCTYPE characterMapping SYSTEM "http://www.unicode.org/unicode/reports/tr22/CharacterMapping.dtd">
michael@0: <characterMapping id="$id" version="1">
michael@0:  <history>
michael@0:   <modified version="1" date="2002-11-30">
michael@0:    Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
michael@0:    with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
michael@0:    and with some other manual tweaking.
michael@0:   </modified>
michael@0:  </history>
michael@0:  <validity>
michael@0:   <state type="FIRST" next="VALID" s="0" e="80" max="FFFF"/>
michael@0:   <state type="FIRST" next="SECOND" s="81" e="FE" max="FFFF"/>
michael@0:   <state type="SECOND" next="VALID" s="40" e="7E" max="FFFF"/>
michael@0:   <state type="SECOND" next="VALID" s="A1" e="FE" max="FFFF"/>
michael@0:  </validity>
michael@0:  <assignments sub="3F">
michael@0: EOT
michael@0: 
michael@0:     my $ranges_enabled = ( $use_range and !$hkscs_mode );
michael@0:     my @unicodes       = sort { hex($a) <=> hex($b) } keys %u2b;
michael@0: 
michael@0:     print "  <!-- One to one mappings -->\n";
michael@0:     for my $uni (@unicodes) {
michael@0:         my $code = $u2b{$uni};
michael@0:         my $u    = hex($uni);
michael@0: 
michael@0:         # Only pairs that round-trip ...
michael@0:         next unless defined( $b2u{$code} ) and $uni eq $b2u{$code};
michael@0: 
michael@0:         # ... and that are not covered by the <range> elements below.
michael@0:         next if $ranges_enabled and $u >= 0xE000 && $u <= 0xF6B0;
michael@0: 
michael@0:         printf "  <a u=\"%04X\" ", $u;
michael@0:         if ( hex($code) <= 0xFF ) {
michael@0:             printf "b=\"%02X\"/>\n", hex($code);
michael@0:         }
michael@0:         else {
michael@0:             printf "b=\"%s %s\"/>\n", substr( $code, 0, 2 ),
michael@0:               substr( $code, 2, 2 );
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     print "  <!-- Fallback mappings from Unicode to bytes -->\n";
michael@0:     for my $uni (@unicodes) {
michael@0:         my $code = $u2b{$uni};
michael@0:         next if defined( $b2u{$code} ) and hex($uni) == hex( $b2u{$code} );
michael@0:         if ( $uni eq "F900" ) {
michael@0:             print "  <!-- CJK Compatibility Ideographs: U+F900 - U+FA6A.\n";
michael@0:             print
michael@0: "       These are included in CP950 (Unicode->Big5 direction only).\n";
michael@0:             print "       Should we include this area in TW-BIG5 or not? -->\n";
michael@0:         }
michael@0:         printf "  <fub u=\"%04X\" b=\"%s %s\"/>\n", hex($uni),
michael@0:           substr( $code, 0, 2 ), substr( $code, 2, 2 );
michael@0:     }
michael@0: 
michael@0:     # Collect the byte -> Unicode fallbacks per code point.  Several Big5
michael@0:     # codes may fall back to the same code point, so each entry is a list;
michael@0:     # a plain scalar would silently keep only the last Big5 code.
michael@0:     my %fbu;
michael@0:     print "  <!-- Fallback mappings from bytes to Unicode -->\n";
michael@0:     for my $code ( sort { hex($a) <=> hex($b) } keys %b2u ) {
michael@0:         my $uni = $b2u{$code};
michael@0:         if ( !defined( $u2b{$uni} ) or hex($code) != hex( $u2b{$uni} ) ) {
michael@0:             push @{ $fbu{$uni} }, $code;
michael@0:         }
michael@0:     }
michael@0:     for my $uni ( sort { hex($a) <=> hex($b) } keys %fbu ) {
michael@0:         for my $code ( @{ $fbu{$uni} } ) {
michael@0:             printf "  <fbu u=\"%04X\" b=\"%s %s\"/>\n", hex($uni),
michael@0:               substr( $code, 0, 2 ), substr( $code, 2, 2 );
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     if ($ranges_enabled) {
michael@0:         print <<EOT;
michael@0:   <!-- Roundtrip-mappings that can be enumerated
michael@0:        Note: We can only use the <range> tag for TW-BIG5.
michael@0:              Big-5E and Big5-HKSCS have assigned characters in these areas,
michael@0: 	     and we will have to use the <a> and <fub> tags instead.
michael@0:     -->
michael@0:   <!-- User-Defined Area 1 (UDA1) -->
michael@0:   <range uFirst="E000" uLast="E310"  bFirst="FA 40" bLast="FE FE" bMin="81 40" bMax="FE FE"/>
michael@0:   <!-- User-Defined Area 2 (UDA2) -->
michael@0:   <range uFirst="E311" uLast="EEB7"  bFirst="8E 40" bLast="A0 FE" bMin="81 40" bMax="FE FE"/>
michael@0:   <!-- User-Defined Area 3 (UDA3) -->
michael@0:   <range uFirst="EEB8" uLast="F6B0"  bFirst="81 40" bLast="8D FE" bMin="81 40" bMax="FE FE"/>
michael@0: EOT
michael@0:     }
michael@0: 
michael@0:     print <<EOT;
michael@0:  </assignments>
michael@0: </characterMapping>
michael@0: EOT
michael@0: 
michael@0: }    # gen_charmapml()
michael@0: 
michael@0: sub gen_check_b2u() {
michael@0: 
michael@0:     ###########################################################################
michael@0:     #
michael@0:     #  Codes for generating a raw table for verification and testing
michael@0:     #
michael@0:     #  Dumps %b2u to STDOUT as "U+<unicode>: <big5>" lines, ordered by the
michael@0:     #  numeric value of the Big5 code.  A leading "00" of the Big5 code is
michael@0:     #  not printed.
michael@0: 
michael@0:     my @codes = sort { hex($a) <=> hex($b) } keys %b2u;
michael@0:     for my $code (@codes) {
michael@0:         ( my $shown = $code ) =~ s/^00//;
michael@0:         print "U+", $b2u{$code}, ": ", $shown, "\n";
michael@0:     }
michael@0: }
michael@0: 
michael@0: sub gen_check_u2b() {
michael@0: 
michael@0:     # Dumps %u2b to STDOUT as "U+<unicode>: <big5>" lines, ordered by the
michael@0:     # numeric value of the Unicode code point.  A leading "00" of the Big5
michael@0:     # code is not printed.
michael@0:     my @unicodes = sort { hex($a) <=> hex($b) } keys %u2b;
michael@0:     for my $uni (@unicodes) {
michael@0:         ( my $shown = $u2b{$uni} ) =~ s/^00//;
michael@0:         print "U+", $uni, ": ", $shown, "\n";
michael@0:     }
michael@0: 
michael@0: }
michael@0: 
michael@0: ###########################################################################
michael@0: #
michael@0: #  Codes for generating hkscs.ut and hkscs.uf files for Mozilla
michael@0: #
michael@0: sub gen_mozilla_uf() {
michael@0: 
michael@0:     # hkscs.uf
michael@0:     #
michael@0:     # Prints "0x<big5> TAB 0x<unicode>" for every entry of %u2b, in string
michael@0:     # order of the Unicode keys, leaving out:
michael@0:     #   * Big5 codes below 0x8140,
michael@0:     #   * Big5 codes in 0xA140-0xC6A0 and 0xC940-0xF9D5,
michael@0:     #   * code points above U+FFFF.
michael@0:     for my $uni ( sort keys %u2b ) {
michael@0:         my $code = $u2b{$uni};
michael@0:         my $b5   = hex($code);
michael@0: 
michael@0:         next if $b5 >= 0xA140 and $b5 <= 0xC6A0;
michael@0:         next if $b5 >= 0xC940 and $b5 <= 0xF9D5;
michael@0:         next if $b5 < 0x8140;
michael@0:         next if hex($uni) > 0xFFFF;
michael@0: 
michael@0:         print "0x", uc($code), "\t0x", uc($uni), "\n";
michael@0:     }
michael@0: }
michael@0: 
michael@0: sub gen_mozilla_ut() {
michael@0: 
michael@0:     # hkscs.ut
michael@0:     #
michael@0:     # Prints "0x<big5> TAB 0x<unicode>" for every entry of %b2u, in string
michael@0:     # order of the Big5 keys, leaving out Big5 codes below 0x8140 and those
michael@0:     # in 0xA140-0xC6A0 and 0xC940-0xF9D5.
michael@0:     for my $code ( sort keys %b2u ) {
michael@0:         my $b5 = hex($code);
michael@0: 
michael@0:         next if $b5 >= 0xA140 and $b5 <= 0xC6A0;
michael@0:         next if $b5 < 0x8140;
michael@0:         next if $b5 >= 0xC940 and $b5 <= 0xF9D5;
michael@0: 
michael@0:         print "0x", uc($code), "\t0x", uc( $b2u{$code} ), "\n";
michael@0:     }
michael@0: }
michael@0: 
michael@0: 
michael@0: ###########################################################################
michael@0: 
michael@0: sub gen_glibc() {
michael@0: 
michael@0:     ##########################################################################
michael@0:     #
michael@0:     #   Generate index for UCS4 to Big5-HKSCS conversion table
michael@0:     #
michael@0:     @index_array = ();
michael@0: 
michael@0:     $mode  = 0;
michael@0:     $count = 0;
michael@0:     for ( $uni = 0x81 ; $uni <= 0x2FFFF ; $uni++ ) {
michael@0:         $unicode = sprintf( "%04X", $uni );
michael@0: 
michael@0:         # print "  /* U+$unicode */\t" if $low % 4 == 0;
michael@0:         if ( defined( $u2b{$unicode} ) ) {
michael@0:             if ( $mode == 0 ) {
michael@0:                 $range_start = $range_end = $uni;
michael@0: 
michael@0:                 # printf "  { %7s, ", sprintf("0x%04X", $range_start);
michael@0:                 $mode = 1;
michael@0:             }
michael@0:             else {
michael@0:                 $range_end = $uni;
michael@0:             }
michael@0:         }
michael@0:         elsif ( $mode == 1 and ( $uni - $range_end ) >= 0x80 ) {
michael@0: 
michael@0:             # Start a new range if the gap is 0x80 or larger
michael@0:             # printf "%7s, %5d },\n", sprintf("0x%04X", $range_end), $count;
michael@0:             push @index_array, [ ( $range_start, $range_end, $count ) ];
michael@0:             $count += $range_end - $range_start + 1;
michael@0:             $mode = 0;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     #
michael@0:     #  Note that $count and $range_end are used again as global variables
michael@0:     #  below
michael@0:     #
michael@0: 
michael@0:     ###########################################################################
michael@0:     #
michael@0:     #  Start generating real C code...
michael@0:     #
michael@0: 
michael@0:     print <<'EOT';
michael@0: /* Mapping tables for Big5-HKSCS handling.
michael@0:    Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
michael@0:    This file is part of the GNU C Library.
michael@0:    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
michael@0:    Modified for Big5-HKSCS by Roger So <roger.so@sw-linux.com>, 2000.
michael@0:    Updated for HKSCS-2001 by James Su <suzhe@turbolinux.com.cn>
michael@0:                          and Anthony Fok <anthony@thizlinux.com>, 2002
michael@0: 
michael@0:    The GNU C Library is free software; you can redistribute it and/or
michael@0:    modify it under the terms of the GNU Lesser General Public
michael@0:    License as published by the Free Software Foundation; either
michael@0:    version 2.1 of the License, or (at your option) any later version.
michael@0: 
michael@0:    The GNU C Library is distributed in the hope that it will be useful,
michael@0:    but WITHOUT ANY WARRANTY; without even the implied warranty of
michael@0:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
michael@0:    Lesser General Public License for more details.
michael@0: 
michael@0:    You should have received a copy of the GNU Lesser General Public
michael@0:    License along with the GNU C Library; if not, write to the Free
michael@0:    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
michael@0:    02111-1307 USA.  */
michael@0: 
michael@0: #include <dlfcn.h>
michael@0: #include <gconv.h>
michael@0: #include <stdint.h>
michael@0: #include <stdlib.h>
michael@0: #include <string.h>
michael@0: #include <wchar.h>
michael@0: 
michael@0: 
michael@0: /* Table for Big5-HKSCS to UCS conversion.
michael@0: 
michael@0:    Original comments by Roger So when he updated the tables for HKSCS-1999:
michael@0: 
michael@0:      With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info:
michael@0:      http://www.digital21.gov.hk/eng/hkscs/index.html
michael@0:        - spacehunt 07/01/2000
michael@0: 
michael@0:    The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt
michael@0:    and big5cmp.txt using a Perl script while merging C source code from
michael@0:    other developers.  A copy of the source Perl script is available at:
michael@0: 
michael@0:       http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
michael@0:       http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
michael@0: 
michael@0:   Revisions:
michael@0:     2001-10-30  made codec for Qt
michael@0:     2002-03-21  ported to glibc-2.2.5 and added HKSCS-2001
michael@0: 
michael@0:   Todo:
michael@0:     Use a hash for characters beyond BMP to save space and make it
michael@0:     more efficient
michael@0: 
michael@0:    - Anthony Fok <anthony@thizlinux.com>  21 Mar 2002
michael@0:      On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China
michael@0: */
michael@0: 
michael@0: EOT
michael@0: 
michael@0:     ##########################################################################
michael@0:     #
michael@0:     # Generate Big5-HKSCS to Unicode conversion table
michael@0:     #
michael@0: 
michael@0:     ## print "Big5HKSCS to Unicode\n";
michael@0: 
michael@0:     # for $high (0x81..0x8d, 0x8e..0xa0, 0xc6..0xc8, 0xf9, 0xfa..0xfe) {
michael@0: 
michael@0:     # Lead-byte range covered by the table.  Both the declared array size
michael@0:     # and the rows emitted below are derived from these two values, so the
michael@0:     # declaration and the data cannot drift apart if the range is changed.
michael@0:     # (The script runs without "use strict"; these stay package globals.)
michael@0:     $high_start = 0x88;
michael@0:     $high_end   = 0xfe;
michael@0: 
michael@0:     # Every lead byte contributes 157 entries: trail bytes 0x40..0x7e (63)
michael@0:     # followed by 0xa1..0xfe (94).
michael@0:     print "static const uint16_t big5_hkscs_to_ucs[";
michael@0:     print( ( $high_end - $high_start + 1 ) * 157 );
michael@0:     print "] =\n{\n";
michael@0:     for $high ( $high_start .. $high_end ) {
michael@0:         for $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) {
michael@0:             if ( $low == 0x40 ) {
michael@0: 
michael@0:                 # Start of a lead byte: blank line between groups (not
michael@0:                 # before the first one) and a comment naming the range.
michael@0:                 print "\n" unless $high == $high_start;
michael@0:                 printf
michael@0:                   "\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n",
michael@0:                   $high, $high, $high, $high;
michael@0:             }
michael@0:             elsif ( $low == 0xa1 ) {
michael@0: 
michael@0:                 # 0xa1 is not a multiple of 8, so the second half of the
michael@0:                 # group starts mid-row; indent it past the skipped column.
michael@0:                 print "\t\t";
michael@0:             }
michael@0: 
michael@0:             # %b2u is keyed by the four-digit upper-case hex Big5 code.
michael@0:             $big5 = sprintf( "%02X%02X", $high, $low );
michael@0:             print "\t" if $low % 8 == 0;
michael@0:             if ( defined( $b2u{$big5} ) ) {
michael@0:                 $unicode = $b2u{$big5};
michael@0:                 print "0x", $unicode, ",";
michael@0:             }
michael@0:             else {
michael@0:                 print "0x0000,";    # for glibc
michael@0:             }
michael@0: 
michael@0:             # Eight entries per row; a row also ends at the last trail
michael@0:             # byte of each half (0x7e and 0xfe).
michael@0:             print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe )
michael@0:                 ? "\n"
michael@0:                 : "\t" );
michael@0:         }
michael@0:     }
michael@0:     print "};\n\n";
michael@0: 
michael@0:     ##########################################################################
michael@0:     #
michael@0:     # Generate Unicode to Big5-HKSCS conversion table
michael@0:     #
michael@0:     # Emit the Unicode -> Big5-HKSCS table: one two-byte C string literal
michael@0:     # per code point, four entries per output row, for each [start, end]
michael@0:     # range listed in @index_array.
michael@0:     print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n";
michael@0:     for $index (@index_array) {
michael@0:         $start = $index->[0];
michael@0:         $end   = $index->[1];
michael@0: 
michael@0:         # A range that begins mid-row gets its own U+XXXX label first ...
michael@0:         if ( $start % 4 ) {
michael@0:             printf "  /* U+%04X */\t", $start;
michael@0:         }
michael@0: 
michael@0:         # ... and is padded so its first entry lands in the right column
michael@0:         # (the fractional repeat count is truncated to whole tabs).
michael@0:         print "\t" x ( 1.5 * ( $start % 4 ) ) . "    " x ( $start % 2 );
michael@0: 
michael@0:         $i = $start;
michael@0:         while ( $i <= $end ) {
michael@0:             if ( $i % 4 == 0 ) {
michael@0:                 printf "  /* U+%04X */\t", $i;
michael@0:             }
michael@0: 
michael@0:             # %u2b is keyed by the four-digit upper-case hex code point.
michael@0:             $unicode = sprintf "%04X", $i;
michael@0:             $big5    = $u2b{$unicode};
michael@0:             if ( !defined $big5 ) {
michael@0:                 print '"\x00\x00",';
michael@0:             }
michael@0:             elsif ( $big5 =~ /^00/ ) {
michael@0: 
michael@0:                 # Single-byte code: the byte goes first, then a NUL.
michael@0:                 print '"\x', substr( $big5, 2, 2 ), '\x00",';
michael@0:             }
michael@0:             else {
michael@0:                 print '"\x', substr( $big5, 0, 2 ), '\x',
michael@0:                   substr( $big5, 2, 2 ), '",';
michael@0:             }
michael@0: 
michael@0:             # Separator between entries; none after the last of a range.
michael@0:             if ( $i != $end ) {
michael@0:                 print( $i % 4 == 3 ? "\n" : " " );
michael@0:             }
michael@0:             $i++;
michael@0:         }
michael@0: 
michael@0:         # Ranges are separated by a blank line, except after the final one.
michael@0:         print( $end != $range_end ? "\n\n" : "\n" );
michael@0:     }
michael@0:     print "};\n\n";
michael@0: 
michael@0:     ###########################################################################
michael@0: 
michael@0:     # Emit the range index: one { from, to, offset } initializer per entry
michael@0:     # of @index_array, with from/to printed as 0x-prefixed hex.
michael@0:     print <<EOT;
michael@0: static struct
michael@0: {
michael@0:     /* Note: We are going to split this table so that we can use
michael@0:        uint16_t for "from" and "to" again.  Anthony Fok, 2002-03-21 */
michael@0:     uint32_t from;
michael@0:     uint32_t to;
michael@0:     uint32_t offset;
michael@0: } from_ucs4_idx[] =
michael@0: {
michael@0: EOT
michael@0:     for $index (@index_array) {
michael@0:         printf "    { %7s, %7s, %5d },\n",
michael@0:           ( map { sprintf( "0x%04X", $_ ) } @$index[ 0, 1 ] ),
michael@0:           $index->[2];
michael@0:     }
michael@0:     print "};\n\n";
michael@0: 
michael@0:     #foreach $i (sort keys %b2u) {
michael@0:     #    print $b2u{$i} . ' ';
michael@0:     #}
michael@0:     # Emit the fixed C gconv body verbatim ('EOT' is quoted, so nothing below is interpolated).
michael@0:     print <<'EOT';
michael@0: /* Definitions used in the body of the `gconv' function.  */
michael@0: #define CHARSET_NAME		"BIG5HKSCS//"
michael@0: #define FROM_LOOP		from_big5
michael@0: #define TO_LOOP			to_big5
michael@0: #define DEFINE_INIT		1
michael@0: #define DEFINE_FINI		1
michael@0: #define MIN_NEEDED_FROM		1
michael@0: #define MAX_NEEDED_FROM		2
michael@0: #define MIN_NEEDED_TO		4
michael@0: 
michael@0: 
michael@0: /* First define the conversion function from Big5-HKSCS to UCS4.  */
michael@0: #define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
michael@0: #define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
michael@0: #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
michael@0: #define LOOPFCT			FROM_LOOP
michael@0: #define BODY \
michael@0:   {									      \
michael@0:     uint32_t ch = *inptr;						      \
michael@0: 									      \
michael@0:     if (ch >= 0x81 && ch <= 0xfe)					      \
michael@0:       {									      \
michael@0: 	/* Two-byte character.  First test whether the next character	      \
michael@0: 	   is also available.  */					      \
michael@0: 	uint32_t ch2;							      \
michael@0: 	int idx;							      \
michael@0: 									      \
michael@0: 	if (__builtin_expect (inptr + 1 >= inend, 0))			      \
michael@0: 	  {								      \
michael@0: 	    /* The second character is not available.  */		      \
michael@0: 	    result = __GCONV_INCOMPLETE_INPUT;				      \
michael@0: 	    break;							      \
michael@0: 	  }								      \
michael@0: 									      \
michael@0: 	ch2 = inptr[1];							      \
michael@0: 	/* See whether the second byte is in the correct range.  */	      \
michael@0: 	if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe))     \
michael@0: 	  {								      \
michael@0: 	    if (ch >= 0x88)						      \
michael@0: 	      {								      \
michael@0: 		/* Look up the table */					      \
michael@0: 		idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62);  \
michael@0: 		if ((ch = big5_hkscs_to_ucs[idx]) == 0)			      \
michael@0: 		  {							      \
michael@0: 		    /* This is illegal.  */				      \
michael@0: 		    if (! ignore_errors_p ())				      \
michael@0: 		      {							      \
michael@0: 			result = __GCONV_ILLEGAL_INPUT;			      \
michael@0: 			break;						      \
michael@0: 		      }							      \
michael@0: 									      \
michael@0: 		    ++inptr;						      \
michael@0: 		    ++*irreversible;					      \
michael@0: 		    continue;						      \
michael@0: 		  }							      \
michael@0: 	      }								      \
michael@0: 	    else							      \
michael@0: 	      {								      \
michael@0: 		/* 0x81..0x87 in UDA3, currently maps linearly to PUA */      \
michael@0: 		ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62)    \
michael@0: 		      + 0xeeb8;						      \
michael@0: 	      }								      \
michael@0: 	  }								      \
michael@0: 	else								      \
michael@0: 	  {								      \
michael@0: 	    /* This is illegal.  */					      \
michael@0: 	    if (! ignore_errors_p ())					      \
michael@0: 	      {								      \
michael@0: 		result = __GCONV_ILLEGAL_INPUT;				      \
michael@0: 		break;							      \
michael@0: 	      }								      \
michael@0: 									      \
michael@0: 	    ++inptr;							      \
michael@0: 	    ++*irreversible;						      \
michael@0: 	    continue;							      \
michael@0: 	  }								      \
michael@0: 									      \
michael@0: 	inptr += 2;							      \
michael@0:       }									      \
michael@0:     else if (__builtin_expect (ch, 0) == 0xff)				      \
michael@0:       {									      \
michael@0: 	result = __GCONV_ILLEGAL_INPUT;					      \
michael@0: 	break;								      \
michael@0:       }									      \
michael@0:     else  /* 0x00 to 0x80 */						      \
michael@0:       ++inptr;								      \
michael@0: 									      \
michael@0:     put32 (outptr, ch);							      \
michael@0:     outptr += 4;							      \
michael@0:   }
michael@0: #define LOOP_NEED_FLAGS
michael@0: #include <iconv/loop.c>
michael@0: 
michael@0: 
michael@0: /* Next, define the other direction.  */
michael@0: #define MIN_NEEDED_INPUT	MIN_NEEDED_TO
michael@0: #define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
michael@0: #define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
michael@0: #define LOOPFCT			TO_LOOP
michael@0: #define BODY \
michael@0:   {									      \
michael@0:     uint32_t ch = get32 (inptr);					      \
michael@0:     const unsigned char *cp = "";						      \
michael@0:     unsigned char b5ch[2] = "\0\0";					      \
michael@0:     int i;								      \
michael@0:     									      \
michael@0:     for (i = 0;								      \
michael@0: 	 i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0]));      \
michael@0: 	 ++i)								      \
michael@0:       {									      \
michael@0: 	if (ch < from_ucs4_idx[i].from)					      \
michael@0: 	  break;							      \
michael@0: 	if (from_ucs4_idx[i].to >= ch)					      \
michael@0: 	  {								      \
michael@0: 	    cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset		      \
michael@0: 			  + ch - from_ucs4_idx[i].from];		      \
michael@0: 	    break;							      \
michael@0: 	  }								      \
michael@0:       }									      \
michael@0: 									      \
michael@0:     if (ch <= 0x80)							      \
michael@0:       {									      \
michael@0: 	b5ch[0] = ch;							      \
michael@0: 	cp = b5ch;							      \
michael@0:       }									      \
michael@0: 									      \
michael@0:     if (cp[0] == '\0' && ch != 0)					      \
michael@0:       {									      \
michael@0: 	UNICODE_TAG_HANDLER (ch, 4);					      \
michael@0: 									      \
michael@0: 	/* Illegal character.  */					      \
michael@0: 	STANDARD_ERR_HANDLER (4);					      \
michael@0:       }									      \
michael@0:     else								      \
michael@0:       {									      \
michael@0: 	/* See whether there is enough room for the second byte we write.  */ \
michael@0: 	if (__builtin_expect (cp[1], '\1') != '\0'			      \
michael@0: 	    && __builtin_expect (outptr + 1 >= outend, 0))		      \
michael@0: 	  {								      \
michael@0: 	    /* We have not enough room.  */				      \
michael@0: 	    result = __GCONV_FULL_OUTPUT;				      \
michael@0: 	    break;							      \
michael@0: 	  }								      \
michael@0: 									      \
michael@0: 	*outptr++ = cp[0];						      \
michael@0: 	if (cp[1] != '\0')						      \
michael@0: 	  *outptr++ = cp[1];						      \
michael@0:       }									      \
michael@0: 									      \
michael@0:     inptr += 4;								      \
michael@0:   }
michael@0: #define LOOP_NEED_FLAGS
michael@0: #include <iconv/loop.c>
michael@0: 
michael@0: 
michael@0: /* Now define the toplevel functions.  */
michael@0: #include <iconv/skeleton.c>
michael@0: EOT
michael@0: 
michael@0: }