diff -r 000000000000 -r 6474c204b198 intl/uconv/tools/gen-big5hkscs-2001-mozilla.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/intl/uconv/tools/gen-big5hkscs-2001-mozilla.pl Wed Dec 31 06:09:35 2014 +0100 @@ -0,0 +1,959 @@ +#!/usr/bin/perl -w +# +# gen-big5hkscs-2001-mozilla.pl +# a Perl script that generates Big5-HKSCS <-> Unicode +# conversion tables for Mozilla +# +# Author (of the original Perl script): +# Anthony Fok +# Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd. +# License: GNU General Public License, v2 or later. +# +# This version includes original C source code from +# glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper +# Roger So +# +# First attempt for Qt-2.3.x: 2001-09-21 +# A working version for Qt-2.3.x: 2001-10-30 +# Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21 +# Adapted to generate conversion tables for Mozilla: 2002-11-26 +# Adapted to generate conversion tables for Mozilla: 2002-11-30 +# Cleaned up the script somewhat: 2002-12-04 +# Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10 +# +# Notes: +# +# 1. The latest version of this script may be found in: +# http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl +# http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl +# Or, better yet, e-mail me and ask for the latest version. +# +# 2. This script generates data from 3 tables: +# a. http://www.microsoft.com/typography/unicode/950.txt +# b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt +# c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt +# +# Make sure your big5-iso.txt is the latest HKSCS-2001 version. +# +# 3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into +# different areas similar to the way Ulrich and Roger did it, +# but extended for HKSCS-2001. +# +# 4. [Mozilla]: This script is very quick-and-dirty in some places. +# Call either gen_mozilla_uf() or gen_mozilla_ut() to generate +# the appropriate tables for feeding into "fromu" or "tou". +# +# 5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized. +# Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode. +# Otherwise, this script would generate a HKSCS table. +# (Yes, I know, I should clean up this script and make it more modular, +# and with command-line options or whatnot. I'll do that later. :-) +# +# If you have any questions or concerns, please feel free to contact me +# at Anthony Fok or :-) +# +# Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK) +# for their generous support in this work. +# + +# 1. UDA3, 0x8840 - 0x8dfe +# 2. UDA2, 0x8e40 - 0xa0fe +# 3. VDA, 0xc6a1 - 0xc8fe + +#use Getopt::Std; + +my ( %b2u, %u2b, $unicode, $big5, $high, $low, $i, $count ); + +my $debug = 0; +my $hkscs_mode = 1; +my $kangxi = 0; +my $use_range = 0; +my $bmp_only = 1; + +# +# Subroutine Declaration +# +sub read_cp950(); +sub adjust_radicals(); +sub read_hkscs_main(); +sub read_hkscs_cmp(); +sub post_tuning(); +sub gen_charmapml(); +sub gen_check_b2u(); +sub gen_check_u2b(); +sub gen_mozilla_uf(); +sub gen_mozilla_ut(); +sub gen_glibc(); + +########################################################################### +# +# Main program +# + +# First, read Microsoft's CP950 as base Big5. +read_cp950 (); + +# Add mappings to Kangxi Radicals. +# The b2u direction is added only if $kangxi is not null. +adjust_radicals (); + +# Then, read the HKSCS table. +# Again, see the $hkscs_mode variable. +read_hkscs_main (); +read_hkscs_cmp () if $hkscs_mode; + +post_tuning (); + + +# Then, choose one of the following: +#gen_charmapml(); +gen_mozilla_uf(); +#gen_mozilla_ut(); +#gen_check_u2b(); +#gen_glibc(); + + +# End of program +exit 0; + + +############################################################################# +# +# Subroutines +# + +sub read_cp950() { + open( CP950, "950.txt" ) or die; + my $mode = 0; + while () { + s/\r//; + chomp; + next if /^$/; + last if /^ENDCODEPAGE/; + + if (/^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/) { + $mode = 1; + ( $count, $high ) = ( $1, $2 ); + $i = 0; + next; + } + if (/^WCTABLE (\d+)/) { + $mode = 2; + $count = $1; + $i = 0; + next; + } + next if $mode == 0; + + if ( $mode == 1 ) { + ( $low, $unicode, $comment ) = split "\t"; + $low =~ s/^0x//; + $unicode =~ s/^0x//; + $big5 = $high . $low; + $b2u{ uc($big5) } = uc($unicode); + if ( ++$i == $count ) { $mode = 0; $count = 0; next; } + } + + if ( $mode == 2 ) { + ( $unicode, $big5, $comment ) = split "\t"; + $unicode =~ s/^0x//; + $big5 =~ s/^0x//; + my $u = hex($unicode); + my $b = hex($big5); + + $u2b{ uc($unicode) } = uc($big5) unless + + # Skip Microsoft's over-generous (or over-zealous?) mappings + # "Faked" accented latin characters + ( $b <= 0xFF and $b != $u ) + + # "Faked" Ideographic Annotation ___ Mark + or ( $u >= 0x3192 and $u <= 0x319F ) + + # "Faked" Parenthesized Ideograph ___ + or ( $u >= 0x3220 and $u <= 0x3243 ) + + # "Faked" Circled Ideograph ___ except Circled Ideograph Correct + or ( $u >= 0x3280 and $u <= 0x32B0 and $u != 0x32A3 ) + + # ¢F¢G¢D¡¦£g¡M + or ( $u == 0xA2 + or $u == 0xA3 + or $u == 0xA5 + or $u == 0xB4 + or $u == 0xB5 + or $u == 0xB8 ) + + # ¡Â¢w¡ü¡E£»¡²¡Ã¢B¢X¡Ý¡[¡ó¡ò¡ã¡Ê + or ( $u == 0x0305 # ??? + or $u == 0x2015 + or $u == 0x2016 + or $u == 0x2022 + or $u == 0x2024 + or $u == 0x2033 + or $u == 0x203E # ??? + or $u == 0x2216 + or $u == 0x2218 + or $u == 0x2263 + or $u == 0x2307 + or $u == 0x2609 + or $u == 0x2641 + or $u == 0x301C + or $u == 0x3030 ) + + # ¡s¡¥¡N + or ( $u == 0xFF3E or $u == 0xFF40 or $u == 0xFF64 ); + + if ( ++$i == $count ) { $mode = 0; $count = 0; next; } + } + } +} + +sub adjust_radicals() { + + # B5+C6BF - B5+C6D7: Radicals (?) + + # TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible. + # + # Big5-HKSCS tends towards using the character in Unicode CJK Ideographs + # Note that HKSCS does not explicitly define + # B5+C6CF, B5+C6D3, B5+C6D5, B5+C6D7 (ÆÏ¡BÆÓ¡BÆÕ¡BÆ×), + # but do have these characters at B5+FBFD, B5+FCD3, B5+FEC1, B5+90C4, + # mapped to U+5EF4, U+65E0, U+7676, U+96B6 respectively. + # + # As for B5+C6CD (ÆÍ), HKSCS maps it to U+2F33 just like TW-BIG5. + # However, it also maps B5+FBF4 (ûô) to U+5E7A. + $b2u{"C6BF"} = "2F02" if $kangxi; + $u2b{"2F02"} = "C6BF"; # Æ¿ + $b2u{"C6C0"} = "2F03" if $kangxi; + $u2b{"2F03"} = "C6C0"; # ÆÀ + $b2u{"C6C1"} = "2F05" if $kangxi; + $u2b{"2F05"} = "C6C1"; # ÆÁ + $b2u{"C6C2"} = "2F07" if $kangxi; + $u2b{"2F07"} = "C6C2"; # ÆÂ + $b2u{"C6C3"} = "2F0C" if $kangxi; + $u2b{"2F0C"} = "C6C3"; # ÆÃ + $b2u{"C6C4"} = "2F0D" if $kangxi; + $u2b{"2F0D"} = "C6C4"; # ÆÄ + $b2u{"C6C5"} = "2F0E" if $kangxi; + $u2b{"2F0E"} = "C6C5"; # ÆÅ + $b2u{"C6C6"} = "2F13" if $kangxi; + $u2b{"2F13"} = "C6C6"; # ÆÆ + $b2u{"C6C7"} = "2F16" if $kangxi; + $u2b{"2F16"} = "C6C7"; # ÆÇ + $b2u{"C6C8"} = "2F19" if $kangxi; + $u2b{"2F19"} = "C6C8"; # ÆÈ + $b2u{"C6C9"} = "2F1B" if $kangxi; + $u2b{"2F1B"} = "C6C9"; # ÆÉ + $b2u{"C6CA"} = "2F22" if $kangxi; + $u2b{"2F22"} = "C6CA"; # ÆÊ + $b2u{"C6CB"} = "2F27" if $kangxi; + $u2b{"2F27"} = "C6CB"; # ÆË + $b2u{"C6CC"} = "2F2E" if $kangxi; + $u2b{"2F2E"} = "C6CC"; # ÆÌ + $b2u{"C6CD"} = "2F33" if $kangxi; + $u2b{"2F33"} = "C6CD"; # ÆÍ + $b2u{"C6CE"} = "2F34" if $kangxi; + $u2b{"2F34"} = "C6CE"; # ÆÎ + $b2u{"C6CF"} = "2F35" if $kangxi; + $u2b{"2F35"} = "C6CF"; # ÆÏ + $b2u{"C6D0"} = "2F39" if $kangxi; + $u2b{"2F39"} = "C6D0"; # ÆÐ + $b2u{"C6D1"} = "2F3A" if $kangxi; + $u2b{"2F3A"} = "C6D1"; # ÆÑ + $b2u{"C6D2"} = "2F41" if $kangxi; + $u2b{"2F41"} = "C6D2"; # ÆÒ + $b2u{"C6D3"} = "2F46" if $kangxi; + $u2b{"2F46"} = "C6D3"; # ÆÓ + $b2u{"C6D4"} = "2F67" if $kangxi; + $u2b{"2F67"} = "C6D4"; # ÆÔ + $b2u{"C6D5"} = "2F68" if $kangxi; + $u2b{"2F68"} = "C6D5"; # ÆÕ + $b2u{"C6D6"} = "2FA1" if $kangxi; + $u2b{"2FA1"} = "C6D6"; # ÆÖ + $b2u{"C6D7"} = "2FAA" if $kangxi; + $u2b{"2FAA"} = "C6D7"; # Æ× +} + +sub read_hkscs_main() { + + open( B2U, ") { + next + unless +/([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/; + ( $big5, $iso1993, $iso2000, $iso2001 ) = ( $1, $2, $3, $4 ); + + my $b = hex($big5); + + # For non-HKSCS mode, only take data in the VDA range (?) + next unless $hkscs_mode + + # Note that we don't go from B5+C6A1-B5+C6FE, but rather only + # C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals) + # because C8D4-C8FE are not assigned in TW-BIG5 + # if we are to follow Arphic PL Big-5 fonts. (To be discussed) + or + ( $b >= 0xC6A1 && $b <= 0xC8D3 and !( $b >= 0xC6BF && $b <= 0xC6D7 ) ) + or ( $b >= 0xF9D6 && $b <= 0xF9FE ); + + print STDERR + "B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n" + if $debug + and defined( $b2u{$big5} ) + and $b2u{$big5} ne $iso2000; + + $b2u{$big5} = $bmp_only ? $iso2000 : $iso2001 + unless !$hkscs_mode + and $b == 0xF9FE; + + # B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to + # U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square) respectively. + # Which is more correct? I don't know! (To be discussed) + + print STDERR + "1993: U+$iso1993 redefined from $u2b{$iso1993} to $big5.\n" + if $debug + and defined( $u2b{$iso1993} ) + and $u2b{$iso1993} ne $big5; + + $u2b{$iso1993} = $big5; + + print STDERR + "2000: U+$iso2000 redefined from $u2b{$iso2000} to $big5.\n" + if $debug + and defined( $u2b{$iso2000} ) + and $u2b{$iso2000} ne $big5; + + $u2b{$iso2000} = $big5; + + print STDERR + "2001: U+$iso2001 redefined from $u2b{$iso2001} to $big5.\n" + if $debug + and defined( $u2b{$iso2001} ) + and $u2b{$iso2001} ne $big5; + + $u2b{$iso2001} = $big5; + } + close B2U; + +} # read_hkscs_main() + + +sub read_hkscs_cmp() { + + ########################################################################### + # Add Big5 compatibility coding... + # + # Stephan, here is the code segment that you may want to implement + # in your convertbig5hkscs2001.pl + # + open( B5CMP, ") { + if (/^=====/) { $mode = 1; next; } + next if $mode == 0; + last if $mode == 1 and /^\s+/; + chomp; + my ( $big5cmp, $big5 ) = split " "; + + $big5cmp = uc($big5cmp); + $big5 = uc($big5); + my $uni = $b2u{$big5}; + my $unicmp = $b2u{$big5cmp}; + + print STDERR + "Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t" + if $debug; + $b2u{$big5cmp} = $uni; + $u2b{$unicmp} = $big5; + print STDERR + "Now: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n" + if $debug; + } + close B5CMP; +} # read_hkscs_cmp(); + + +sub post_tuning() { + + # And finally, fine-tuning... + for $i ( 0x00 .. 0x80 ) { + $big5 = $unicode = sprintf( "%04X", $i ); + $b2u{$big5} = $unicode; + } + + # Add Euro '£á' (I wonder why this 950.txt doesn't have it.) + $b2u{"A3E1"} = "20AC"; + $u2b{"20AC"} = "A3E1"; + + # Box drawing characters: + # Align with Big-5E (To be discussed, as it differs from CP950 and HKSCS) + # (To be discussed) + if ( !$hkscs_mode ) { + $u2b{"2550"} = "A2A4"; # Big5: ¢¤ (also B5-F9F9) + $u2b{"255E"} = "A2A5"; # Big5: ¢¥ (also B5-F9E9) + $u2b{"2561"} = "A2A7"; # Big5: ¢§ (also B5-F9EB) + $u2b{"256A"} = "A2A6"; # Big5: ¢¦ (also B5-F9EA) + $u2b{"256D"} = "A27E"; # Big5: ¢~ (also B5-F9FA) + $u2b{"256E"} = "A2A1"; # Big5: ¢¡ (also B5-F9FB) + $u2b{"256F"} = "A2A3"; # Big5: ¢£ (also B5-F9FD) + $u2b{"2570"} = "A2A2"; # Big5: ¢¢ (also B5-F9FC) + } + + # "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30 (¢Ì¢Í¢Î) + # (To be discussed) + if ( !$hkscs_mode ) { + $b2u{"A2CC"} = "3038"; + $u2b{"3038"} = "A2CC"; + $b2u{"A2CD"} = "3039"; + $u2b{"3039"} = "A2CD"; + $b2u{"A2CE"} = "303A"; + $u2b{"303A"} = "A2CE"; + } + + # The character for ethnic group "Yi" (ÂU): + # (To be discussed) + $u2b{"5F5E"} = "C255"; # Always add this. + if ( !$hkscs_mode ) { + $b2u{"C255"} = "5F5E"; + } + +} # post_tuning() + + +sub gen_charmapml() { + + ########################################################################### + # + # Codes for generating CharMapML XML file + + print < + +EOT + + if ($hkscs_mode) { + print < + + + Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001) + with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed, + and with some other manual tweaking. + + +EOT + } + else { + print < + + + Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001) + with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed, + and with some other manual tweaking. + + +EOT + } + + print < + + + + + + +EOT + print " \n"; + for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) { + $big5 = $u2b{$unicode}; + $u = hex($unicode); + next + unless defined( $b2u{$big5} ) + and $unicode eq $b2u{$big5} + and + not( $use_range and !$hkscs_mode and $u >= 0xE000 && $u <= 0xF6B0 ); + printf " \n", hex($big5); + } + else { + printf "b=\"%s %s\"/>\n", substr( $big5, 0, 2 ), + substr( $big5, 2, 2 ); + } + } + + print " \n"; + for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) { + $big5 = $u2b{$unicode}; + next if defined( $b2u{$big5} ) and hex($unicode) == hex( $b2u{$big5} ); + if ( $unicode eq "F900" ) { + print " \n"; + } + printf " \n", hex($unicode), + substr( $big5, 0, 2 ), substr( $big5, 2, 2 ); + } + + my %fbu; + print " \n"; + for $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) { + $unicode = $b2u{$big5}; + if ( !defined( $u2b{$unicode} ) or hex($big5) != hex( $u2b{$unicode} ) ) + { + $fbu{$unicode} = $big5; + } + } + for $unicode ( sort { hex($a) <=> hex($b) } keys %fbu ) { + $big5 = $fbu{$unicode}; + printf " \n", hex($unicode), + substr( $big5, 0, 2 ), substr( $big5, 2, 2 ); + } + + if ( $use_range and !$hkscs_mode ) { + print < tag for TW-BIG5. + Big-5E and Big5-HKSCS have assigned characters in these areas, + and we will have to use the and tags instead. + --> + + + + + + +EOT + } + + print < + +EOT + +} # gen_charmapml() + +sub gen_check_b2u() { + + ########################################################################### + # + # Codes for generating a raw table for verification and testing + # + # #print $u2b{"F7D1"}, "\n"; + # print $b2u{$u2b{"F7D1"}}, "\n"; + # print "FA59 -> U+", $b2u{"FA59"}, "\n"; + + foreach $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) { + $unicode = $b2u{$big5}; + $big5 =~ s/^00//; + print "U+", $unicode, ": ", $big5, "\n"; + } +} + +sub gen_check_u2b() { + foreach $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) { + $big5 = $u2b{$unicode}; + $big5 =~ s/^00//; + print "U+", $unicode, ": ", $big5, "\n"; + } + +} + +########################################################################### +# +# Codes for generating hkscs.ut and hkscs.uf files for Mozilla +# +sub gen_mozilla_uf() { + # hkscs.uf + foreach $unicode ( sort keys %u2b ) { + $big5 = $u2b{$unicode}; + my $b = hex($big5); + print "0x", uc($big5), "\t0x", uc($unicode), "\n" + unless ( $b >= 0xA140 and $b <= 0xC6A0 ) + or ( $b >= 0xC940 and $b <= 0xF9D5 ) + or ( $b < 0x8140 ) + or ( hex($unicode) > 0xFFFF ); + } +} + +sub gen_mozilla_ut() { + # hkscs.ut + foreach $big5 ( sort keys %b2u ) { + my $b = hex($big5); + print "0x", uc($big5), "\t0x", uc( $b2u{$big5} ), "\n" + unless ( $b >= 0xA140 and $b <= 0xC6A0 ) + or ( $b < 0x8140 ) + or ( $b >= 0xC940 and $b <= 0xF9D5 ); + } +} + + +########################################################################### + +sub gen_glibc() { + + ########################################################################## + # + # Generate index for UCS4 to Big5-HKSCS conversion table + # + @index_array = (); + + $mode = 0; + $count = 0; + for ( $uni = 0x81 ; $uni <= 0x2FFFF ; $uni++ ) { + $unicode = sprintf( "%04X", $uni ); + + # print " /* U+$unicode */\t" if $low % 4 == 0; + if ( defined( $u2b{$unicode} ) ) { + if ( $mode == 0 ) { + $range_start = $range_end = $uni; + + # printf " { %7s, ", sprintf("0x%04X", $range_start); + $mode = 1; + } + else { + $range_end = $uni; + } + } + elsif ( $mode == 1 and ( $uni - $range_end ) >= 0x80 ) { + + # Start a new range if the gap is 0x80 or larger + # printf "%7s, %5d },\n", sprintf("0x%04X", $range_end), $count; + push @index_array, [ ( $range_start, $range_end, $count ) ]; + $count += $range_end - $range_start + 1; + $mode = 0; + } + } + + # + # Note that $count and $range_end are used again as global variables + # below + # + + ########################################################################### + # + # Start generating real C code... + # + + print <<'EOT'; +/* Mapping tables for Big5-HKSCS handling. + Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper , 1997. + Modified for Big5-HKSCS by Roger So , 2000. + Updated for HKSCS-2001 by James Su + and Anthony Fok , 2002 + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include +#include + + +/* Table for Big5-HKSCS to UCS conversion. + + Original comments by Roger So when he updated the tables for HKSCS-1999: + + With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info: + http://www.digital21.gov.hk/eng/hkscs/index.html + - spacehunt 07/01/2000 + + The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt + and big5cmp.txt using a Perl script while merging C source code from + other developers. A copy of the source Perl script is available at: + + http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl + http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl + + Revisions: + 2001-10-30 made codec for Qt + 2002-03-21 ported to glibc-2.2.5 and added HKSCS-2001 + + Todo: + Use a hash for characters beyond BMP to save space and make it + more efficient + + - Anthony Fok 21 Mar 2002 + On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China +*/ + +EOT + + ########################################################################## + # + # Generate Big5-HKSCS to Unicode conversion table + # + + ## print "Big5HKSCS to Unicode\n"; + + # for $high (0x81..0x8d, 0x8e..0xa0, 0xc6..0xc8, 0xf9, 0xfa..0xfe) { + + $high_start = 0x88; + $high_end = 0xfe; + + print "static const uint16_t big5_hkscs_to_ucs["; + print( ( $high_end - $high_start + 1 ) * 157 ); + print "] =\n{\n"; + for $high ( 0x88 .. 0xfe ) { + for $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) { + if ( $low == 0x40 ) { + print "\n" unless $high == $high_start; + printf + "\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n", + $high, $high, $high, $high; + } + elsif ( $low == 0xa1 ) { + print "\t\t"; + } + $big5 = sprintf( "%02X%02X", $high, $low ); + print "\t" if $low % 8 == 0; + if ( defined( $b2u{$big5} ) ) { + $unicode = $b2u{$big5}; + print "0x", $unicode, ","; + } + else { + print "0x0000,"; # for glibc + } + print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe ) + ? "\n" + : "\t" ); + } + } + print "};\n\n"; + + ########################################################################## + # + # Generate Unicode to Big5-HKSCS conversion table + # + print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n"; + foreach $index (@index_array) { + ( $start, $end ) = ( @$index[0], @$index[1] ); + printf( " /* U+%04X */\t", $start ) if ( $start % 4 != 0 ); + print "\t" x ( ( $start % 4 ) * 1.5 ) . " " x ( $start % 2 ); + for ( $i = $start ; $i <= $end ; $i++ ) { + printf( " /* U+%04X */\t", $i ) if ( $i % 4 == 0 ); + $unicode = sprintf( "%04X", $i ); + if ( defined( $big5 = $u2b{$unicode} ) ) { + if ( $big5 =~ /^00/ ) { + print '"\x', substr( $big5, 2, 2 ), '\x00",'; + } + else { + print '"\x', substr( $big5, 0, 2 ), '\x', + substr( $big5, 2, 2 ), '",'; + } + } + else { + print '"\x00\x00",'; + } + print( ( $i % 4 == 3 ) ? "\n" : " " ) unless $i == $end; + } + print $end == $range_end ? "\n" : "\n\n"; + } + print "};\n\n"; + + ########################################################################### + + print <= 0x81 && ch <= 0xfe) \ + { \ + /* Two-byte character. First test whether the next character \ + is also available. */ \ + uint32_t ch2; \ + int idx; \ + \ + if (__builtin_expect (inptr + 1 >= inend, 0)) \ + { \ + /* The second character is not available. */ \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + ch2 = inptr[1]; \ + /* See whether the second byte is in the correct range. */ \ + if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe)) \ + { \ + if (ch >= 0x88) \ + { \ + /* Look up the table */ \ + idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62); \ + if ((ch = big5_hkscs_to_ucs[idx]) == 0) \ + { \ + /* This is illegal. */ \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + ++inptr; \ + ++*irreversible; \ + continue; \ + } \ + } \ + else \ + { \ + /* 0x81..0x87 in UDA3, currently maps linearly to PUA */ \ + ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62) \ + + 0xeeb8; \ + } \ + } \ + else \ + { \ + /* This is illegal. */ \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + ++inptr; \ + ++*irreversible; \ + continue; \ + } \ + \ + inptr += 2; \ + } \ + else if (__builtin_expect (ch, 0) == 0xff) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + else /* 0x00 to 0x80 */ \ + ++inptr; \ + \ + put32 (outptr, ch); \ + outptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include + + +/* Next, define the other direction. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY \ + { \ + uint32_t ch = get32 (inptr); \ + const unsigned char *cp = ""; \ + unsigned char b5ch[2] = "\0\0"; \ + int i; \ + \ + for (i = 0; \ + i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0])); \ + ++i) \ + { \ + if (ch < from_ucs4_idx[i].from) \ + break; \ + if (from_ucs4_idx[i].to >= ch) \ + { \ + cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset \ + + ch - from_ucs4_idx[i].from]; \ + break; \ + } \ + } \ + \ + if (ch <= 0x80) \ + { \ + b5ch[0] = ch; \ + cp = b5ch; \ + } \ + \ + if (cp[0] == '\0' && ch != 0) \ + { \ + UNICODE_TAG_HANDLER (ch, 4); \ + \ + /* Illegal character. */ \ + STANDARD_ERR_HANDLER (4); \ + } \ + else \ + { \ + /* See whether there is enough room for the second byte we write. */ \ + if (__builtin_expect (cp[1], '\1') != '\0' \ + && __builtin_expect (outptr + 1 >= outend, 0)) \ + { \ + /* We have not enough room. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + *outptr++ = cp[0]; \ + if (cp[1] != '\0') \ + *outptr++ = cp[1]; \ + } \ + \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include + + +/* Now define the toplevel functions. */ +#include +EOT + +}