intl/uconv/tools/gen-big5hkscs-2001-mozilla.pl

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double-key logic depending on private browsing
mode or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts. Some reservations
remain about how to convince FindCookie users to test the condition and
pass a nullptr when double-key logic is disabled.

     1 #!/usr/bin/perl -w
     2 #
     3 #  gen-big5hkscs-2001-mozilla.pl
     4 #      a Perl script that generates Big5-HKSCS <-> Unicode
     5 #      conversion tables for Mozilla
     6 #
     7 #  Author (of the original Perl script):
     8 #      Anthony Fok <anthony@thizlinux.com> <foka@debian.org>
     9 #  Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd.
    10 #  License: GNU General Public License, v2 or later.
    11 #
    12 #  This version includes original C source code from
    13 #  glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper <drepper@redhat.com>
    14 #  Roger So <roger.so@sw-linux.com>
    15 #
    16 #                         First attempt for Qt-2.3.x: 2001-09-21
    17 #                     A working version for Qt-2.3.x: 2001-10-30
    18 #              Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21
    19 #  Adapted to generate conversion tables for Mozilla: 2002-11-26
    20 #  Adapted to generate conversion tables for Mozilla: 2002-11-30
    21 #                     Cleaned up the script somewhat: 2002-12-04
    22 # Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10
    23 #
    24 #  Notes:
    25 #
    26 #   1. The latest version of this script may be found in:
    27 #          http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
    28 #          http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
    29 #      Or, better yet, e-mail me and ask for the latest version.
    30 #
    31 #   2. This script generates data from 3 tables:
    32 #       a. http://www.microsoft.com/typography/unicode/950.txt
    33 #       b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt
    34 #       c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt
    35 #
    36 #      Make sure your big5-iso.txt is the latest HKSCS-2001 version.
    37 #
    38 #   3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into
    39 #       different areas similar to the way Ulrich and Roger did it,
    40 #       but extended for HKSCS-2001.
    41 #
    42 #   4. [Mozilla]: This script is very quick-and-dirty in some places.
    43 #       Call either gen_mozilla_uf() or gen_mozilla_ut() to generate
    44 #       the appropriate tables for feeding into "fromu" or "tou".
    45 #
    46 #   5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized.
    47 #       Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode.
    48 #       Otherwise, this script would generate a HKSCS table.
    49 #       (Yes, I know, I should clean up this script and make it more modular,
    50 #       and with command-line options or whatnot.  I'll do that later.  :-)
    51 #
    52 #  If you have any questions or concerns, please feel free to contact me
    53 #  at Anthony Fok <anthony@thizlinux.com> or <foka@debian.org>  :-)
    54 #
    55 #  Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK)
    56 #  for their generous support in this work.
    57 #
    59 # 1. UDA3, 0x8840 - 0x8dfe
    60 # 2. UDA2, 0x8e40 - 0xa0fe
    61 # 3. VDA,  0xc6a1 - 0xc8fe
    63 #use Getopt::Std;
    65 my ( %b2u, %u2b, $unicode, $big5, $high, $low, $i, $count );
    67 my $debug = 0;
    68 my $hkscs_mode = 1;
    69 my $kangxi = 0;
    70 my $use_range  = 0;
    71 my $bmp_only  = 1;
    73 #
    74 # Subroutine Declaration
    75 #
    76 sub read_cp950();
    77 sub adjust_radicals();
    78 sub read_hkscs_main();
    79 sub read_hkscs_cmp();
    80 sub post_tuning();
    81 sub gen_charmapml();
    82 sub gen_check_b2u();
    83 sub gen_check_u2b();
    84 sub gen_mozilla_uf();
    85 sub gen_mozilla_ut();
    86 sub gen_glibc();
    88 ###########################################################################
    89 #
    90 # Main program
    91 #
    93 # First, read Microsoft's CP950 as base Big5.
    94 read_cp950 ();
    96 # Add mappings to Kangxi Radicals.
    97 # The b2u direction is added only if $kangxi is not null.
    98 adjust_radicals ();
   100 # Then, read the HKSCS table.
   101 # Again, see the $hkscs_mode variable.
   102 read_hkscs_main ();
   103 read_hkscs_cmp () if $hkscs_mode;
   105 post_tuning ();
   108 # Then, choose one of the following:
   109 #gen_charmapml();
   110 gen_mozilla_uf();
   111 #gen_mozilla_ut();
   112 #gen_check_u2b();
   113 #gen_glibc();
   116 # End of program
   117 exit 0;
   120 #############################################################################
   121 #
   122 #  Subroutines
   123 #
   125 sub read_cp950() {
   126     open( CP950, "950.txt" ) or die;
   127     my $mode = 0;
   128     while (<CP950>) {
   129         s/\r//;
   130         chomp;
   131         next if /^$/;
   132         last if /^ENDCODEPAGE/;
   134         if (/^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/) {
   135             $mode = 1;
   136             ( $count, $high ) = ( $1, $2 );
   137             $i = 0;
   138             next;
   139         }
   140         if (/^WCTABLE (\d+)/) {
   141             $mode  = 2;
   142             $count = $1;
   143             $i     = 0;
   144             next;
   145         }
   146         next if $mode == 0;
   148         if ( $mode == 1 ) {
   149             ( $low, $unicode, $comment ) = split "\t";
   150             $low     =~ s/^0x//;
   151             $unicode =~ s/^0x//;
   152             $big5 = $high . $low;
   153             $b2u{ uc($big5) } = uc($unicode);
   154             if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
   155         }
   157         if ( $mode == 2 ) {
   158             ( $unicode, $big5, $comment ) = split "\t";
   159             $unicode =~ s/^0x//;
   160             $big5    =~ s/^0x//;
   161             my $u = hex($unicode);
   162             my $b = hex($big5);
   164             $u2b{ uc($unicode) } = uc($big5) unless
   166               # Skip Microsoft's over-generous (or over-zealous?) mappings
   167               # "Faked" accented latin characters
   168               ( $b <= 0xFF and $b != $u )
   170               # "Faked" Ideographic Annotation ___ Mark
   171               or ( $u >= 0x3192 and $u <= 0x319F )
   173               # "Faked" Parenthesized Ideograph ___
   174               or ( $u >= 0x3220 and $u <= 0x3243 )
   176               # "Faked" Circled Ideograph ___ except Circled Ideograph Correct
   177               or ( $u >= 0x3280 and $u <= 0x32B0 and $u != 0x32A3 )
   179               # ¢F¢G¢D¡¦£g¡M
   180               or ( $u == 0xA2
   181                 or $u == 0xA3
   182                 or $u == 0xA5
   183                 or $u == 0xB4
   184                 or $u == 0xB5
   185                 or $u == 0xB8 )
   187               # ¡Â¢w¡ü¡E£»¡²¡Ã¢B¢X¡Ý¡[¡ó¡ò¡ã¡Ê
   188               or ( $u == 0x0305		# ???
   189                 or $u == 0x2015
   190                 or $u == 0x2016
   191                 or $u == 0x2022
   192                 or $u == 0x2024
   193                 or $u == 0x2033
   194                 or $u == 0x203E		# ???
   195                 or $u == 0x2216
   196                 or $u == 0x2218
   197                 or $u == 0x2263
   198                 or $u == 0x2307
   199                 or $u == 0x2609
   200                 or $u == 0x2641
   201                 or $u == 0x301C
   202                 or $u == 0x3030 )
   204               # ¡s¡¥¡N
   205               or ( $u == 0xFF3E or $u == 0xFF40 or $u == 0xFF64 );
   207             if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
   208         }
   209     }
   210 }
   212 sub adjust_radicals() {
   214     # B5+C6BF - B5+C6D7: Radicals (?)
   216     # TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible.
   217     #
   218     #   Big5-HKSCS tends towards using the character in Unicode CJK Ideographs
   219     #   Note that HKSCS does not explicitly define
   220     #       B5+C6CF, B5+C6D3, B5+C6D5, B5+C6D7 (ÆÏ¡BÆÓ¡BÆÕ¡BÆ×),
   221     #   but do have these characters at B5+FBFD, B5+FCD3, B5+FEC1, B5+90C4,
   222     #   mapped to U+5EF4, U+65E0, U+7676, U+96B6 respectively.
   223     #
   224     #   As for B5+C6CD (ÆÍ), HKSCS maps it to U+2F33 just like TW-BIG5.
   225     #   However, it also maps B5+FBF4 (ûô) to U+5E7A.
   226     $b2u{"C6BF"} = "2F02" if $kangxi;
   227     $u2b{"2F02"} = "C6BF";              # Æ¿
   228     $b2u{"C6C0"} = "2F03" if $kangxi;
   229     $u2b{"2F03"} = "C6C0";              # ÆÀ
   230     $b2u{"C6C1"} = "2F05" if $kangxi;
   231     $u2b{"2F05"} = "C6C1";              # ÆÁ
   232     $b2u{"C6C2"} = "2F07" if $kangxi;
   233     $u2b{"2F07"} = "C6C2";              # ÆÂ
   234     $b2u{"C6C3"} = "2F0C" if $kangxi;
   235     $u2b{"2F0C"} = "C6C3";              # ÆÃ
   236     $b2u{"C6C4"} = "2F0D" if $kangxi;
   237     $u2b{"2F0D"} = "C6C4";              # ÆÄ
   238     $b2u{"C6C5"} = "2F0E" if $kangxi;
   239     $u2b{"2F0E"} = "C6C5";              # ÆÅ
   240     $b2u{"C6C6"} = "2F13" if $kangxi;
   241     $u2b{"2F13"} = "C6C6";              # ÆÆ
   242     $b2u{"C6C7"} = "2F16" if $kangxi;
   243     $u2b{"2F16"} = "C6C7";              # ÆÇ
   244     $b2u{"C6C8"} = "2F19" if $kangxi;
   245     $u2b{"2F19"} = "C6C8";              # ÆÈ
   246     $b2u{"C6C9"} = "2F1B" if $kangxi;
   247     $u2b{"2F1B"} = "C6C9";              # ÆÉ
   248     $b2u{"C6CA"} = "2F22" if $kangxi;
   249     $u2b{"2F22"} = "C6CA";              # ÆÊ
   250     $b2u{"C6CB"} = "2F27" if $kangxi;
   251     $u2b{"2F27"} = "C6CB";              # ÆË
   252     $b2u{"C6CC"} = "2F2E" if $kangxi;
   253     $u2b{"2F2E"} = "C6CC";              # ÆÌ
   254     $b2u{"C6CD"} = "2F33" if $kangxi;
   255     $u2b{"2F33"} = "C6CD";              # ÆÍ
   256     $b2u{"C6CE"} = "2F34" if $kangxi;
   257     $u2b{"2F34"} = "C6CE";              # ÆÎ
   258     $b2u{"C6CF"} = "2F35" if $kangxi;
   259     $u2b{"2F35"} = "C6CF";              # ÆÏ
   260     $b2u{"C6D0"} = "2F39" if $kangxi;
   261     $u2b{"2F39"} = "C6D0";              # ÆÐ
   262     $b2u{"C6D1"} = "2F3A" if $kangxi;
   263     $u2b{"2F3A"} = "C6D1";              # ÆÑ
   264     $b2u{"C6D2"} = "2F41" if $kangxi;
   265     $u2b{"2F41"} = "C6D2";              # ÆÒ
   266     $b2u{"C6D3"} = "2F46" if $kangxi;
   267     $u2b{"2F46"} = "C6D3";              # ÆÓ
   268     $b2u{"C6D4"} = "2F67" if $kangxi;
   269     $u2b{"2F67"} = "C6D4";              # ÆÔ
   270     $b2u{"C6D5"} = "2F68" if $kangxi;
   271     $u2b{"2F68"} = "C6D5";              # ÆÕ
   272     $b2u{"C6D6"} = "2FA1" if $kangxi;
   273     $u2b{"2FA1"} = "C6D6";              # ÆÖ
   274     $b2u{"C6D7"} = "2FAA" if $kangxi;
   275     $u2b{"2FAA"} = "C6D7";              # Æ×
   276 }
   278 sub read_hkscs_main() {
   280     open( B2U, "<big5-iso.txt" ) or die;
   281     while (<B2U>) {
   282         next
   283           unless
   284 /([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/;
   285         ( $big5, $iso1993, $iso2000, $iso2001 ) = ( $1, $2, $3, $4 );
   287         my $b = hex($big5);
   289         # For non-HKSCS mode, only take data in the VDA range (?)
   290         next unless $hkscs_mode
   292           # Note that we don't go from B5+C6A1-B5+C6FE, but rather only
   293           # C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals)
   294           # because C8D4-C8FE are not assigned in TW-BIG5
   295           # if we are to follow Arphic PL Big-5 fonts.  (To be discussed)
   296           or
   297           ( $b >= 0xC6A1 && $b <= 0xC8D3 and !( $b >= 0xC6BF && $b <= 0xC6D7 ) )
   298           or ( $b >= 0xF9D6 && $b <= 0xF9FE );
   300         print STDERR
   301           "B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n"
   302           if $debug
   303           and defined( $b2u{$big5} )
   304           and $b2u{$big5} ne $iso2000;
   306         $b2u{$big5} = $bmp_only ? $iso2000 : $iso2001
   307           unless !$hkscs_mode
   308           and $b == 0xF9FE;
   310         # B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to
   311         # U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square) respectively.
   312         # Which is more correct?  I don't know!  (To be discussed)
   314         print STDERR
   315           "1993: U+$iso1993 redefined from $u2b{$iso1993} to $big5.\n"
   316           if $debug
   317           and defined( $u2b{$iso1993} )
   318           and $u2b{$iso1993} ne $big5;
   320         $u2b{$iso1993} = $big5;
   322         print STDERR
   323           "2000: U+$iso2000 redefined from $u2b{$iso2000} to $big5.\n"
   324           if $debug
   325           and defined( $u2b{$iso2000} )
   326           and $u2b{$iso2000} ne $big5;
   328         $u2b{$iso2000} = $big5;
   330         print STDERR
   331           "2001: U+$iso2001 redefined from $u2b{$iso2001} to $big5.\n"
   332           if $debug
   333           and defined( $u2b{$iso2001} )
   334           and $u2b{$iso2001} ne $big5;
   336         $u2b{$iso2001} = $big5;
   337     }
   338     close B2U;
   340 }    # read_hkscs_main()
   343 sub read_hkscs_cmp() {
   345     ###########################################################################
   346     # Add Big5 compatibility coding...
   347     #
   348     # Stephan, here is the code segment that you may want to implement
   349     # in your convertbig5hkscs2001.pl 
   350     #
   351     open( B5CMP, "<big5cmp.txt" ) or die;
   352     $mode = 0;
   353     while (<B5CMP>) {
   354         if (/^=====/) { $mode = 1; next; }
   355         next if $mode == 0;
   356         last if $mode == 1 and /^\s+/;
   357         chomp;
   358         my ( $big5cmp, $big5 ) = split " ";
   360         $big5cmp = uc($big5cmp);
   361         $big5    = uc($big5);
   362         my $uni    = $b2u{$big5};
   363         my $unicmp = $b2u{$big5cmp};
   365         print STDERR
   366           "Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t"
   367           if $debug;
   368         $b2u{$big5cmp} = $uni;
   369         $u2b{$unicmp}  = $big5;
   370         print STDERR
   371           "Now:  U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n"
   372           if $debug;
   373     }
   374     close B5CMP;
   375 }    # read_hkscs_cmp();
   378 sub post_tuning() {
   380     # And finally, fine-tuning...
   381     for $i ( 0x00 .. 0x80 ) {
   382         $big5 = $unicode = sprintf( "%04X", $i );
   383         $b2u{$big5} = $unicode;
   384     }
   386     # Add Euro '£á' (I wonder why this 950.txt doesn't have it.)
   387     $b2u{"A3E1"} = "20AC";
   388     $u2b{"20AC"} = "A3E1";
   390     # Box drawing characters:
   391     # Align with Big-5E (To be discussed, as it differs from CP950 and HKSCS)
   392     # (To be discussed)
   393     if ( !$hkscs_mode ) {
   394         $u2b{"2550"} = "A2A4";    # Big5: ¢¤	(also B5-F9F9)
   395         $u2b{"255E"} = "A2A5";    # Big5: ¢¥	(also B5-F9E9)
   396         $u2b{"2561"} = "A2A7";    # Big5: ¢§	(also B5-F9EB)
   397         $u2b{"256A"} = "A2A6";    # Big5: ¢¦	(also B5-F9EA)
   398         $u2b{"256D"} = "A27E";    # Big5: ¢~	(also B5-F9FA)
   399         $u2b{"256E"} = "A2A1";    # Big5: ¢¡	(also B5-F9FB)
   400         $u2b{"256F"} = "A2A3";    # Big5: ¢£	(also B5-F9FD)
   401         $u2b{"2570"} = "A2A2";    # Big5: ¢¢	(also B5-F9FC)
   402     }
   404     # "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30 (¢Ì¢Í¢Î)
   405     # (To be discussed)
   406     if ( !$hkscs_mode ) {
   407         $b2u{"A2CC"} = "3038";
   408         $u2b{"3038"} = "A2CC";
   409         $b2u{"A2CD"} = "3039";
   410         $u2b{"3039"} = "A2CD";
   411         $b2u{"A2CE"} = "303A";
   412         $u2b{"303A"} = "A2CE";
   413     }
   415     # The character for ethnic group "Yi" (ÂU):
   416     # (To be discussed)
   417     $u2b{"5F5E"} = "C255";    # Always add this.
   418     if ( !$hkscs_mode ) {
   419         $b2u{"C255"} = "5F5E";
   420     }
   422 }    # post_tuning()
   425 sub gen_charmapml() {
   427     ###########################################################################
   428     #
   429     #  Codes for generating CharMapML XML file
   431     print <<EOT;
   432 <?xml version="1.0" encoding="UTF-8" ?>
   433 <!DOCTYPE characterMapping SYSTEM "http://www.unicode.org/unicode/reports/tr22/CharacterMapping.dtd">
   434 EOT
   436     if ($hkscs_mode) {
   437         print <<EOT;
   438 <characterMapping id="big5-hkscs-2001" version="1">
   439  <history>
   440   <modified version="1" date="2002-11-30">
   441    Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
   442    with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
   443    and with some other manual tweaking.
   444   </modified>
   445  </history>
   446 EOT
   447     }
   448     else {
   449         print <<EOT;
   450 <characterMapping id="tw-big5-2002" version="1">
   451  <history>
   452   <modified version="1" date="2002-11-30">
   453    Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
   454    with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
   455    and with some other manual tweaking.
   456   </modified>
   457  </history>
   458 EOT
   459     }
   461     print <<EOT;
   462  <validity>
   463   <state type="FIRST" next="VALID" s="0" e="80" max="FFFF"/>
   464   <state type="FIRST" next="SECOND" s="81" e="FE" max="FFFF"/>
   465   <state type="SECOND" next="VALID" s="40" e="7E" max="FFFF"/>
   466   <state type="SECOND" next="VALID" s="A1" e="FE" max="FFFF"/>
   467  </validity>
   468  <assignments sub="3F">
   469 EOT
   470     print "  <!-- One to one mappings -->\n";
   471     for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
   472         $big5 = $u2b{$unicode};
   473         $u    = hex($unicode);
   474         next
   475           unless defined( $b2u{$big5} )
   476           and $unicode eq $b2u{$big5}
   477           and
   478           not( $use_range and !$hkscs_mode and $u >= 0xE000 && $u <= 0xF6B0 );
   479         printf "  <a u=\"%04X\" ", $u;
   480         if ( hex($big5) <= 0xFF ) {
   481             printf "b=\"%02X\"/>\n", hex($big5);
   482         }
   483         else {
   484             printf "b=\"%s %s\"/>\n", substr( $big5, 0, 2 ),
   485               substr( $big5, 2, 2 );
   486         }
   487     }
   489     print "  <!-- Fallback mappings from Unicode to bytes -->\n";
   490     for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
   491         $big5 = $u2b{$unicode};
   492         next if defined( $b2u{$big5} ) and hex($unicode) == hex( $b2u{$big5} );
   493         if ( $unicode eq "F900" ) {
   494             print "  <!-- CJK Compatibility Ideographs: U+F900 - U+FA6A.\n";
   495             print
   496 "       These are included in CP950 (Unicode->Big5 direction only).\n";
   497             print "       Should we include this area in TW-BIG5 or not? -->\n";
   498         }
   499         printf "  <fub u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
   500           substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
   501     }
   503     my %fbu;
   504     print "  <!-- Fallback mappings from bytes to Unicode -->\n";
   505     for $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) {
   506         $unicode = $b2u{$big5};
   507         if ( !defined( $u2b{$unicode} ) or hex($big5) != hex( $u2b{$unicode} ) )
   508         {
   509             $fbu{$unicode} = $big5;
   510         }
   511     }
   512     for $unicode ( sort { hex($a) <=> hex($b) } keys %fbu ) {
   513         $big5 = $fbu{$unicode};
   514         printf "  <fbu u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
   515           substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
   516     }
   518     if ( $use_range and !$hkscs_mode ) {
   519         print <<EOT;
   520   <!-- Roundtrip-mappings that can be enumerated
   521        Note: We can only use the <range> tag for TW-BIG5.
   522              Big-5E and Big5-HKSCS have assigned characters in these areas,
   523 	     and we will have to use the <a> and <fub> tags instead.
   524     -->
   525   <!-- User-Defined Area 1 (UDA1) -->
   526   <range uFirst="E000" uLast="E310"  bFirst="FA 40" bLast="FE FE" bMin="81 40" bMax="FE FE"/>
   527   <!-- User-Defined Area 2 (UDA2) -->
   528   <range uFirst="E311" uLast="EEB7"  bFirst="8E 40" bLast="A0 FE" bMin="81 40" bMax="FE FE"/>
   529   <!-- User-Defined Area 3 (UDA3) -->
   530   <range uFirst="EEB8" uLast="F6B0"  bFirst="81 40" bLast="8D FE" bMin="81 40" bMax="FE FE"/>
   531 EOT
   532     }
   534     print <<EOT;
   535  </assignments>
   536 </characterMapping>
   537 EOT
   539 }    # gen_charmapml()
   541 sub gen_check_b2u() {
   543     ###########################################################################
   544     #
   545     #  Codes for generating a raw table for verification and testing
   546     #
   547     # #print $u2b{"F7D1"}, "\n";
   548     # print $b2u{$u2b{"F7D1"}}, "\n";
   549     # print "FA59 -> U+", $b2u{"FA59"}, "\n";
   551     foreach $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) {
   552         $unicode = $b2u{$big5};
   553         $big5 =~ s/^00//;
   554         print "U+", $unicode, ": ", $big5, "\n";
   555     }
   556 }
   558 sub gen_check_u2b() {
   559     foreach $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
   560         $big5 = $u2b{$unicode};
   561         $big5 =~ s/^00//;
   562         print "U+", $unicode, ": ", $big5, "\n";
   563     }
   565 }
   567 ###########################################################################
   568 #
   569 #  Codes for generating hkscs.ut and hkscs.uf files for Mozilla
   570 #
   571 sub gen_mozilla_uf() {
   572     # hkscs.uf
   573     foreach $unicode ( sort keys %u2b ) {
   574         $big5 = $u2b{$unicode};
   575 	my $b = hex($big5);
   576         print "0x", uc($big5), "\t0x", uc($unicode), "\n"
   577           unless ( $b >= 0xA140 and $b <= 0xC6A0 )
   578           or ( $b >= 0xC940 and $b <= 0xF9D5 )
   579           or ( $b < 0x8140 )
   580           or ( hex($unicode) > 0xFFFF );
   581     }
   582 }
   584 sub gen_mozilla_ut() {
   585     # hkscs.ut
   586     foreach $big5 ( sort keys %b2u ) {
   587         my $b = hex($big5);
   588         print "0x", uc($big5), "\t0x", uc( $b2u{$big5} ), "\n"
   589           unless ( $b >= 0xA140 and $b <= 0xC6A0 )
   590 	  or ( $b < 0x8140 )
   591           or ( $b >= 0xC940 and $b <= 0xF9D5 );
   592     }
   593 }
   596 ###########################################################################
   598 sub gen_glibc() {
   600     ##########################################################################
   601     #
   602     #   Generate index for UCS4 to Big5-HKSCS conversion table
   603     #
   604     @index_array = ();
   606     $mode  = 0;
   607     $count = 0;
   608     for ( $uni = 0x81 ; $uni <= 0x2FFFF ; $uni++ ) {
   609         $unicode = sprintf( "%04X", $uni );
   611         # print "  /* U+$unicode */\t" if $low % 4 == 0;
   612         if ( defined( $u2b{$unicode} ) ) {
   613             if ( $mode == 0 ) {
   614                 $range_start = $range_end = $uni;
   616                 # printf "  { %7s, ", sprintf("0x%04X", $range_start);
   617                 $mode = 1;
   618             }
   619             else {
   620                 $range_end = $uni;
   621             }
   622         }
   623         elsif ( $mode == 1 and ( $uni - $range_end ) >= 0x80 ) {
   625             # Start a new range if the gap is 0x80 or larger
   626             # printf "%7s, %5d },\n", sprintf("0x%04X", $range_end), $count;
   627             push @index_array, [ ( $range_start, $range_end, $count ) ];
   628             $count += $range_end - $range_start + 1;
   629             $mode = 0;
   630         }
   631     }
   633     #
   634     #  Note that $count and $range_end are used again as global variables
   635     #  below
   636     #
   638     ###########################################################################
   639     #
   640     #  Start generating real C code...
   641     #
   643     print <<'EOT';
   644 /* Mapping tables for Big5-HKSCS handling.
   645    Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
   646    This file is part of the GNU C Library.
   647    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
   648    Modified for Big5-HKSCS by Roger So <roger.so@sw-linux.com>, 2000.
   649    Updated for HKSCS-2001 by James Su <suzhe@turbolinux.com.cn>
   650                          and Anthony Fok <anthony@thizlinux.com>, 2002
   652    The GNU C Library is free software; you can redistribute it and/or
   653    modify it under the terms of the GNU Lesser General Public
   654    License as published by the Free Software Foundation; either
   655    version 2.1 of the License, or (at your option) any later version.
   657    The GNU C Library is distributed in the hope that it will be useful,
   658    but WITHOUT ANY WARRANTY; without even the implied warranty of
   659    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   660    Lesser General Public License for more details.
   662    You should have received a copy of the GNU Lesser General Public
   663    License along with the GNU C Library; if not, write to the Free
   664    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   665    02111-1307 USA.  */
   667 #include <dlfcn.h>
   668 #include <gconv.h>
   669 #include <stdint.h>
   670 #include <stdlib.h>
   671 #include <string.h>
   672 #include <wchar.h>
   675 /* Table for Big5-HKSCS to UCS conversion.
   677    Original comments by Roger So when he updated the tables for HKSCS-1999:
   679      With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info:
   680      http://www.digital21.gov.hk/eng/hkscs/index.html
   681        - spacehunt 07/01/2000
   683    The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt
   684    and big5cmp.txt using a Perl script while merging C source code from
   685    other developers.  A copy of the source Perl script is available at:
   687       http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
   688       http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
   690   Revisions:
   691     2001-10-30  made codec for Qt
   692     2002-03-21  ported to glibc-2.2.5 and added HKSCS-2001
   694   Todo:
   695     Use a hash for characters beyond BMP to save space and make it
   696     more efficient
   698    - Anthony Fok <anthony@thizlinux.com>  21 Mar 2002
   699      On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China
   700 */
   702 EOT
   704     ##########################################################################
   705     #
   706     # Generate Big5-HKSCS to Unicode conversion table
   707     #
   709     ## print "Big5HKSCS to Unicode\n";
   711     # for $high (0x81..0x8d, 0x8e..0xa0, 0xc6..0xc8, 0xf9, 0xfa..0xfe) {
   713     $high_start = 0x88;
   714     $high_end   = 0xfe;
   716     print "static const uint16_t big5_hkscs_to_ucs[";
   717     print( ( $high_end - $high_start + 1 ) * 157 );
   718     print "] =\n{\n";
   719     for $high ( 0x88 .. 0xfe ) {
   720         for $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) {
   721             if ( $low == 0x40 ) {
   722                 print "\n" unless $high == $high_start;
   723                 printf
   724                   "\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n",
   725                   $high, $high, $high, $high;
   726             }
   727             elsif ( $low == 0xa1 ) {
   728                 print "\t\t";
   729             }
   730             $big5 = sprintf( "%02X%02X", $high, $low );
   731             print "\t" if $low % 8 == 0;
   732             if ( defined( $b2u{$big5} ) ) {
   733                 $unicode = $b2u{$big5};
   734                 print "0x", $unicode, ",";
   735             }
   736             else {
   737                 print "0x0000,";    # for glibc
   738             }
   739             print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe ) 
   740                 ? "\n"
   741                 : "\t" );
   742         }
   743     }
   744     print "};\n\n";
   746     ##########################################################################
   747     #
   748     # Generate Unicode to Big5-HKSCS conversion table
   749     #
   750     print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n";
   751     foreach $index (@index_array) {
   752         ( $start, $end ) = ( @$index[0], @$index[1] );
   753         printf( "  /* U+%04X */\t", $start ) if ( $start % 4 != 0 );
   754         print "\t" x ( ( $start % 4 ) * 1.5 ) . "    " x ( $start % 2 );
   755         for ( $i = $start ; $i <= $end ; $i++ ) {
   756             printf( "  /* U+%04X */\t", $i ) if ( $i % 4 == 0 );
   757             $unicode = sprintf( "%04X", $i );
   758             if ( defined( $big5 = $u2b{$unicode} ) ) {
   759                 if ( $big5 =~ /^00/ ) {
   760                     print '"\x', substr( $big5, 2, 2 ), '\x00",';
   761                 }
   762                 else {
   763                     print '"\x', substr( $big5, 0, 2 ), '\x',
   764                       substr( $big5, 2, 2 ), '",';
   765                 }
   766             }
   767             else {
   768                 print '"\x00\x00",';
   769             }
   770             print( ( $i % 4 == 3 ) ? "\n" : " " ) unless $i == $end;
   771         }
   772         print $end == $range_end ? "\n" : "\n\n";
   773     }
   774     print "};\n\n";
   776     ###########################################################################
   778     print <<EOT;
   779 static struct
   780 {
   781     /* Note: We are going to split this table so that we can use
   782        uint16_t for "from" and "to" again.  Anthony Fok, 2002-03-21 */
   783     uint32_t from;
   784     uint32_t to;
   785     uint32_t offset;
   786 } from_ucs4_idx[] =
   787 {
   788 EOT
   789     foreach $index (@index_array) {
   790         printf "    { %7s, %7s, %5d },\n", sprintf( "0x%04X", @$index[0] ),
   791           sprintf( "0x%04X", @$index[1] ), @$index[2];
   792     }
   793     print "};\n\n";
   795     #foreach $i (sort keys %b2u) {
   796     #    print $b2u{$i} . ' ';
   797     #}
   799     print <<'EOT';
   800 /* Definitions used in the body of the `gconv' function.  */
   801 #define CHARSET_NAME		"BIG5HKSCS//"
   802 #define FROM_LOOP		from_big5
   803 #define TO_LOOP			to_big5
   804 #define DEFINE_INIT		1
   805 #define DEFINE_FINI		1
   806 #define MIN_NEEDED_FROM		1
   807 #define MAX_NEEDED_FROM		2
   808 #define MIN_NEEDED_TO		4
   811 /* First define the conversion function from Big5-HKSCS to UCS4.  */
   812 #define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
   813 #define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
   814 #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
   815 #define LOOPFCT			FROM_LOOP
   816 #define BODY \
   817   {									      \
   818     uint32_t ch = *inptr;						      \
   819 									      \
   820     if (ch >= 0x81 && ch <= 0xfe)					      \
   821       {									      \
   822 	/* Two-byte character.  First test whether the next character	      \
   823 	   is also available.  */					      \
   824 	uint32_t ch2;							      \
   825 	int idx;							      \
   826 									      \
   827 	if (__builtin_expect (inptr + 1 >= inend, 0))			      \
   828 	  {								      \
   829 	    /* The second character is not available.  */		      \
   830 	    result = __GCONV_INCOMPLETE_INPUT;				      \
   831 	    break;							      \
   832 	  }								      \
   833 									      \
   834 	ch2 = inptr[1];							      \
   835 	/* See whether the second byte is in the correct range.  */	      \
   836 	if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe))     \
   837 	  {								      \
   838 	    if (ch >= 0x88)						      \
   839 	      {								      \
   840 		/* Look up the table */					      \
   841 		idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62);  \
   842 		if ((ch = big5_hkscs_to_ucs[idx]) == 0)			      \
   843 		  {							      \
   844 		    /* This is illegal.  */				      \
   845 		    if (! ignore_errors_p ())				      \
   846 		      {							      \
   847 			result = __GCONV_ILLEGAL_INPUT;			      \
   848 			break;						      \
   849 		      }							      \
   850 									      \
   851 		    ++inptr;						      \
   852 		    ++*irreversible;					      \
   853 		    continue;						      \
   854 		  }							      \
   855 	      }								      \
   856 	    else							      \
   857 	      {								      \
   858 		/* 0x81..0x87 in UDA3, currently maps linearly to PUA */      \
   859 		ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62)    \
   860 		      + 0xeeb8;						      \
   861 	      }								      \
   862 	  }								      \
   863 	else								      \
   864 	  {								      \
   865 	    /* This is illegal.  */					      \
   866 	    if (! ignore_errors_p ())					      \
   867 	      {								      \
   868 		result = __GCONV_ILLEGAL_INPUT;				      \
   869 		break;							      \
   870 	      }								      \
   871 									      \
   872 	    ++inptr;							      \
   873 	    ++*irreversible;						      \
   874 	    continue;							      \
   875 	  }								      \
   876 									      \
   877 	inptr += 2;							      \
   878       }									      \
   879     else if (__builtin_expect (ch, 0) == 0xff)				      \
   880       {									      \
   881 	result = __GCONV_ILLEGAL_INPUT;					      \
   882 	break;								      \
   883       }									      \
   884     else  /* 0x00 to 0x80 */						      \
   885       ++inptr;								      \
   886 									      \
   887     put32 (outptr, ch);							      \
   888     outptr += 4;							      \
   889   }
   890 #define LOOP_NEED_FLAGS
   891 #include <iconv/loop.c>
   894 /* Next, define the other direction.  */
   895 #define MIN_NEEDED_INPUT	MIN_NEEDED_TO
   896 #define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
   897 #define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
   898 #define LOOPFCT			TO_LOOP
   899 #define BODY \
   900   {									      \
   901     uint32_t ch = get32 (inptr);					      \
   902     const unsigned char *cp = "";						      \
   903     unsigned char b5ch[2] = "\0\0";					      \
   904     int i;								      \
   905     									      \
   906     for (i = 0;								      \
   907 	 i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0]));      \
   908 	 ++i)								      \
   909       {									      \
   910 	if (ch < from_ucs4_idx[i].from)					      \
   911 	  break;							      \
   912 	if (from_ucs4_idx[i].to >= ch)					      \
   913 	  {								      \
   914 	    cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset		      \
   915 			  + ch - from_ucs4_idx[i].from];		      \
   916 	    break;							      \
   917 	  }								      \
   918       }									      \
   919 									      \
   920     if (ch <= 0x80)							      \
   921       {									      \
   922 	b5ch[0] = ch;							      \
   923 	cp = b5ch;							      \
   924       }									      \
   925 									      \
   926     if (cp[0] == '\0' && ch != 0)					      \
   927       {									      \
   928 	UNICODE_TAG_HANDLER (ch, 4);					      \
   929 									      \
   930 	/* Illegal character.  */					      \
   931 	STANDARD_ERR_HANDLER (4);					      \
   932       }									      \
   933     else								      \
   934       {									      \
   935 	/* See whether there is enough room for the second byte we write.  */ \
   936 	if (__builtin_expect (cp[1], '\1') != '\0'			      \
   937 	    && __builtin_expect (outptr + 1 >= outend, 0))		      \
   938 	  {								      \
   939 	    /* We have not enough room.  */				      \
   940 	    result = __GCONV_FULL_OUTPUT;				      \
   941 	    break;							      \
   942 	  }								      \
   943 									      \
   944 	*outptr++ = cp[0];						      \
   945 	if (cp[1] != '\0')						      \
   946 	  *outptr++ = cp[1];						      \
   947       }									      \
   948 									      \
   949     inptr += 4;								      \
   950   }
   951 #define LOOP_NEED_FLAGS
   952 #include <iconv/loop.c>
   955 /* Now define the toplevel functions.  */
   956 #include <iconv/skeleton.c>
   957 EOT
   959 }

mercurial