Tue, 06 Jan 2015 21:39:09 +0100
Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
michael@0 | 1 | #!/usr/bin/perl -w |
michael@0 | 2 | # |
michael@0 | 3 | # gen-big5hkscs-2001-mozilla.pl |
michael@0 | 4 | # a Perl script that generates Big5-HKSCS <-> Unicode |
michael@0 | 5 | # conversion tables for Mozilla |
michael@0 | 6 | # |
michael@0 | 7 | # Author (of the original Perl script): |
michael@0 | 8 | # Anthony Fok <anthony@thizlinux.com> <foka@debian.org> |
michael@0 | 9 | # Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd. |
michael@0 | 10 | # License: GNU General Public License, v2 or later. |
michael@0 | 11 | # |
michael@0 | 12 | # This version includes original C source code from |
michael@0 | 13 | # glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper <drepper@redhat.com> |
michael@0 | 14 | # Roger So <roger.so@sw-linux.com> |
michael@0 | 15 | # |
michael@0 | 16 | # First attempt for Qt-2.3.x: 2001-09-21 |
michael@0 | 17 | # A working version for Qt-2.3.x: 2001-10-30 |
michael@0 | 18 | # Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21 |
michael@0 | 19 | # Adapted to generate conversion tables for Mozilla: 2002-11-26 |
michael@0 | 20 | # Adapted to generate conversion tables for Mozilla: 2002-11-30 |
michael@0 | 21 | # Cleaned up the script somewhat: 2002-12-04 |
michael@0 | 22 | # Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10 |
michael@0 | 23 | # |
michael@0 | 24 | # Notes: |
michael@0 | 25 | # |
michael@0 | 26 | # 1. The latest version of this script may be found in: |
michael@0 | 27 | # http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl |
michael@0 | 28 | # http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl |
michael@0 | 29 | # Or, better yet, e-mail me and ask for the latest version. |
michael@0 | 30 | # |
michael@0 | 31 | # 2. This script generates data from 3 tables: |
michael@0 | 32 | # a. http://www.microsoft.com/typography/unicode/950.txt |
michael@0 | 33 | # b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt |
michael@0 | 34 | # c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt |
michael@0 | 35 | # |
michael@0 | 36 | # Make sure your big5-iso.txt is the latest HKSCS-2001 version. |
michael@0 | 37 | # |
michael@0 | 38 | # 3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into |
michael@0 | 39 | # different areas similar to the way Ulrich and Roger did it, |
michael@0 | 40 | # but extended for HKSCS-2001. |
michael@0 | 41 | # |
michael@0 | 42 | # 4. [Mozilla]: This script is very quick-and-dirty in some places. |
michael@0 | 43 | # Call either gen_mozilla_uf() or gen_mozilla_ut() to generate |
michael@0 | 44 | # the appropriate tables for feeding into "fromu" or "tou". |
michael@0 | 45 | # |
michael@0 | 46 | # 5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized. |
michael@0 | 47 | # Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode. |
michael@0 | 48 | # Otherwise, this script would generate a HKSCS table. |
michael@0 | 49 | # (Yes, I know, I should clean up this script and make it more modular, |
michael@0 | 50 | # and with command-line options or whatnot. I'll do that later. :-) |
michael@0 | 51 | # |
michael@0 | 52 | # If you have any questions or concerns, please feel free to contact me |
michael@0 | 53 | # at Anthony Fok <anthony@thizlinux.com> or <foka@debian.org> :-) |
michael@0 | 54 | # |
michael@0 | 55 | # Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK) |
michael@0 | 56 | # for their generous support in this work. |
michael@0 | 57 | # |
michael@0 | 58 | |
michael@0 | 59 | # 1. UDA3, 0x8840 - 0x8dfe |
michael@0 | 60 | # 2. UDA2, 0x8e40 - 0xa0fe |
michael@0 | 61 | # 3. VDA, 0xc6a1 - 0xc8fe |
michael@0 | 62 | |
michael@0 | 63 | #use Getopt::Std; |
michael@0 | 64 | |
# Conversion tables shared by every subroutine in this script.
# Keys and values are hexadecimal strings without a "0x" prefix.
my %b2u;    # Big5 code -> Unicode code point
my %u2b;    # Unicode code point -> Big5 code

# Scratch variables shared by the subroutines below.
my ( $unicode, $big5 );
my ( $high,    $low );
my ( $i,       $count );

# Switches.  Edit these to change what the script generates.
my $debug      = 0;    # true: report redefined mappings on STDERR
my $hkscs_mode = 1;    # true: Big5-HKSCS table; false: TW-BIG5 table
my $kangxi     = 0;    # true: also map the radicals Big5 -> Kangxi Radicals
my $use_range  = 0;    # true: emit <range> tags in CharMapML (TW-BIG5 only)
my $bmp_only   = 1;    # true: use the ISO 10646-1:2000 column of big5-iso.txt

#
# Subroutine declarations
#

# Table readers and fix-ups
sub read_cp950();
sub adjust_radicals();
sub read_hkscs_main();
sub read_hkscs_cmp();
sub post_tuning();

# Output generators
sub gen_charmapml();
sub gen_check_b2u();
sub gen_check_u2b();
sub gen_mozilla_uf();
sub gen_mozilla_ut();
sub gen_glibc();

###########################################################################
#
# Main program
#

# Step 1: Microsoft's CP950 is the base Big5 table.
read_cp950();

# Step 2: add the mappings to Kangxi Radicals.
# The b2u direction is added only if $kangxi is true.
adjust_radicals();

# Step 3: merge the HKSCS table; the compatibility table is merged only
# in HKSCS mode (see $hkscs_mode above).
read_hkscs_main();
if ($hkscs_mode) {
    read_hkscs_cmp();
}

# Step 4: final manual adjustments.
post_tuning();

# Step 5: produce the output.  Exactly one generator should be enabled:
#gen_charmapml();
gen_mozilla_uf();
#gen_mozilla_ut();
#gen_check_u2b();
#gen_glibc();

# End of program
exit 0;
michael@0 | 118 | |
michael@0 | 119 | |
michael@0 | 120 | ############################################################################# |
michael@0 | 121 | # |
michael@0 | 122 | # Subroutines |
michael@0 | 123 | # |
michael@0 | 124 | |
# read_cp950()
#
# Read Microsoft's CP950 code page definition from "950.txt" in the current
# directory and use it as the base Big5 mapping:
#
#   * each DBCSTABLE row is stored in %b2u (Big5 -> Unicode);
#   * each WCTABLE row is stored in %u2b (Unicode -> Big5), unless it is one
#     of Microsoft's best-fit ("faked") mappings listed below.
#
# Keys and values are stored as upper-case hex strings without "0x".
# Reading stops at the ENDCODEPAGE line.  Dies with a diagnostic if the file
# cannot be opened; the file handle is closed before returning.
sub read_cp950() {
    my $file = "950.txt";
    open( my $cp950, '<', $file ) or die "Cannot open $file: $!";

    # Individual Unicode code points whose CP950 Unicode -> Big5 mapping is
    # rejected (Latin-1 symbols, general punctuation and symbols, and three
    # fullwidth/halfwidth forms).
    my %rejected = map { $_ => 1 } (
        0xA2,   0xA3,   0xA5,   0xB4,   0xB5,   0xB8,
        0x0305, 0x2015, 0x2016, 0x2022, 0x2024, 0x2033,
        0x203E, 0x2216, 0x2218, 0x2263, 0x2307, 0x2609,
        0x2641, 0x301C, 0x3030,
        0xFF3E, 0xFF40, 0xFF64,
    );

    # 0 = outside any table, 1 = inside a DBCSTABLE, 2 = inside the WCTABLE
    my $mode = 0;
    while (<$cp950>) {
        s/\r//;
        chomp;
        next if /^$/;
        last if /^ENDCODEPAGE/;

        # Table headers announce how many rows follow.
        if (/^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/) {
            $mode = 1;
            ( $count, $high ) = ( $1, $2 );
            $i = 0;
            next;
        }
        if (/^WCTABLE (\d+)/) {
            $mode  = 2;
            $count = $1;
            $i     = 0;
            next;
        }
        next if $mode == 0;

        if ( $mode == 1 ) {

            # Row: trail byte, Unicode value (a trailing comment is ignored).
            ( $low, $unicode ) = split /\t/;
            $low     =~ s/^0x//;
            $unicode =~ s/^0x//;
            $big5 = $high . $low;
            $b2u{ uc($big5) } = uc($unicode);
            if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
        }
        elsif ( $mode == 2 ) {

            # Row: Unicode value, Big5 value (a trailing comment is ignored).
            ( $unicode, $big5 ) = split /\t/;
            $unicode =~ s/^0x//;
            $big5    =~ s/^0x//;
            my $u    = hex($unicode);
            my $code = hex($big5);

            # Skip Microsoft's over-generous (or over-zealous?) mappings.
            my $faked =

              # "Faked" accented latin characters
              ( $code <= 0xFF and $code != $u )

              # "Faked" Ideographic Annotation ___ Mark
              || ( $u >= 0x3192 and $u <= 0x319F )

              # "Faked" Parenthesized Ideograph ___
              || ( $u >= 0x3220 and $u <= 0x3243 )

              # "Faked" Circled Ideograph ___ except Circled Ideograph Correct
              || ( $u >= 0x3280 and $u <= 0x32B0 and $u != 0x32A3 )

              # Individually rejected code points
              || $rejected{$u};

            $u2b{ uc($unicode) } = uc($big5) unless $faked;

            if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
        }
    }
    close($cp950);
}
michael@0 | 211 | |
# adjust_radicals()
#
# Pair the 25 consecutive Big5 codes B5+C6BF .. B5+C6D7 (radicals) with
# Kangxi Radical code points.
#
# The Unicode -> Big5 direction (%u2b) is always recorded.  The
# Big5 -> Unicode direction (%b2u) is recorded only when $kangxi is true.
#
# Background from the original author:
#   * TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible,
#     while Big5-HKSCS tends towards the character in Unicode CJK Ideographs.
#   * HKSCS does not explicitly define B5+C6CF, B5+C6D3, B5+C6D5 and B5+C6D7,
#     but has these characters at B5+FBFD, B5+FCD3, B5+FEC1 and B5+90C4,
#     mapped to U+5EF4, U+65E0, U+7676 and U+96B6 respectively.
#   * HKSCS maps B5+C6CD to U+2F33 just like TW-BIG5, but also maps
#     B5+FBF4 to U+5E7A.
sub adjust_radicals() {

    # Kangxi Radical code points, in the order of the Big5 codes
    # C6BF, C6C0, ... C6D7 they belong to.
    my @radicals = qw(
      2F02 2F03 2F05 2F07 2F0C
      2F0D 2F0E 2F13 2F16 2F19
      2F1B 2F22 2F27 2F2E 2F33
      2F34 2F35 2F39 2F3A 2F41
      2F46 2F67 2F68 2FA1 2FAA
    );

    my $next_code = 0xC6BF;
    for my $radical (@radicals) {
        my $code = sprintf( "%04X", $next_code++ );
        $b2u{$code}    = $radical if $kangxi;
        $u2b{$radical} = $code;
    }
}
michael@0 | 277 | |
# read_hkscs_main()
#
# Merge the HKSCS table "big5-iso.txt" (current directory) into %b2u / %u2b.
#
# Every usable row holds four hex fields: the Big5 code followed by its
# Unicode value in the three columns the original author named iso1993,
# iso2000 and iso2001.  Rows that do not match that shape are ignored.
#
#   * %b2u{Big5} is set to the iso2000 value when $bmp_only is true,
#     otherwise to the iso2001 value.
#   * %u2b is set for all three Unicode values, in column order.
#   * When $hkscs_mode is false only rows inside the accepted Big5 ranges
#     are used, and B5+F9FE keeps its existing Big5 -> Unicode mapping.
#   * When $debug is true, every redefinition is reported on STDERR.
#
# All four fields are upper-cased before use so that they agree with the
# upper-case keys produced by the other readers.
# Dies with a diagnostic if the file cannot be opened.
sub read_hkscs_main() {

    my $file = "big5-iso.txt";
    open( my $table, '<', $file ) or die "Cannot open $file: $!";

    while ( my $line = <$table> ) {
        next
          unless $line =~
/([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/;
        my ( $iso1993, $iso2000, $iso2001 );
        ( $big5, $iso1993, $iso2000, $iso2001 ) =
          ( uc($1), uc($2), uc($3), uc($4) );

        my $code = hex($big5);

        # For non-HKSCS mode, only take data in the VDA range (?)
        #
        # Note that we don't go from B5+C6A1-B5+C6FE, but rather only
        # C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals)
        # because C8D4-C8FE are not assigned in TW-BIG5
        # if we are to follow Arphic PL Big-5 fonts.  (To be discussed)
        next
          unless $hkscs_mode
          or ( $code >= 0xC6A1 && $code <= 0xC8D3
            and !( $code >= 0xC6BF && $code <= 0xC6D7 ) )
          or ( $code >= 0xF9D6 && $code <= 0xF9FE );

        print STDERR
          "B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n"
          if $debug
          and defined( $b2u{$big5} )
          and $b2u{$big5} ne $iso2000;

        # B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to
        # U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square)
        # respectively, so TW-BIG5 mode leaves its b2u entry untouched.
        # Which is more correct?  I don't know!  (To be discussed)
        $b2u{$big5} = $bmp_only ? $iso2000 : $iso2001
          unless !$hkscs_mode
          and $code == 0xF9FE;

        # All three Unicode values map back to this Big5 code.  The columns
        # are processed in order so that debug output matches the sequence
        # in which the mappings are (re)defined.
        for my $column (
            [ "1993", $iso1993 ],
            [ "2000", $iso2000 ],
            [ "2001", $iso2001 ]
          )
        {
            my ( $year, $ucs ) = @$column;
            print STDERR
              "$year: U+$ucs redefined from $u2b{$ucs} to $big5.\n"
              if $debug
              and defined( $u2b{$ucs} )
              and $u2b{$ucs} ne $big5;

            $u2b{$ucs} = $big5;
        }
    }
    close($table);

}    # read_hkscs_main()
michael@0 | 341 | |
michael@0 | 342 | |
# read_hkscs_cmp()
#
# Apply the Big5 compatibility table "big5cmp.txt" (current directory).
#
# Rows are read after the first line starting with "=====" and up to the
# first line that starts with whitespace (a blank line included).  Each row
# holds two Big5 codes: a compatibility code and the standard code.
# For every row:
#
#   * the compatibility code is made to decode to the same Unicode value
#     as the standard code (%b2u);
#   * the Unicode value the compatibility code used to decode to is made
#     to encode to the standard code (%u2b).
#
# A row with fewer than two fields is skipped, and a direction is left
# untouched when the lookup it depends on has no entry, so no undefined
# value or empty key is ever written into the tables.
# Dies with a diagnostic if the file cannot be opened.
sub read_hkscs_cmp() {

    ###########################################################################
    # Add Big5 compatibility coding...
    #
    # Stephan, here is the code segment that you may want to implement
    # in your convertbig5hkscs2001.pl
    #
    my $file = "big5cmp.txt";
    open( my $cmp, '<', $file ) or die "Cannot open $file: $!";

    my $in_table = 0;
    while ( my $line = <$cmp> ) {
        if ( $line =~ /^=====/ ) { $in_table = 1; next; }
        next unless $in_table;
        last if $line =~ /^\s+/;
        chomp $line;

        my ( $big5cmp, $standard ) = split " ", $line;
        next unless defined $big5cmp and defined $standard;

        $big5cmp  = uc($big5cmp);
        $standard = uc($standard);
        my $uni    = $b2u{$standard};
        my $unicmp = $b2u{$big5cmp};

        # Debug output is unconditional, so it may warn about undefined
        # values for rows whose codes are not in the tables.
        print STDERR
          "Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t"
          if $debug;
        $b2u{$big5cmp} = $uni      if defined $uni;
        $u2b{$unicmp}  = $standard if defined $unicmp;
        print STDERR
          "Now: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n"
          if $debug;
    }
    close($cmp);
}    # read_hkscs_cmp();
michael@0 | 376 | |
michael@0 | 377 | |
# post_tuning()
#
# Final manual adjustments applied after all source tables have been read.
#
# Always:
#   * Big5 0x00 .. 0x80 decode to the identical Unicode value;
#   * the Euro sign is added at B5+A3E1 <-> U+20AC;
#   * U+5F5E (the character for the ethnic group "Yi") encodes to B5+C255.
#
# Only when $hkscs_mode is false (TW-BIG5):
#   * eight box drawing characters encode to their A2xx codes;
#   * the "Hangzhou"/"Suzhou" numerals 10, 20, 30 are mapped both ways;
#   * B5+C255 decodes to U+5F5E.
#
# The TW-BIG5 adjustments are all marked "To be discussed" by the author.
sub post_tuning() {

    # And finally, fine-tuning...
    for my $byte ( 0x00 .. 0x80 ) {
        my $hex = sprintf( "%04X", $byte );
        $b2u{$hex} = $hex;
    }

    # Add Euro (the author wonders why this 950.txt doesn't have it).
    ( $b2u{"A3E1"}, $u2b{"20AC"} ) = ( "20AC", "A3E1" );

    # The character for ethnic group "Yi": always add this direction.
    $u2b{"5F5E"} = "C255";

    # Everything below applies to TW-BIG5 only.
    return if $hkscs_mode;

    # Box drawing characters:
    # Align with Big-5E (To be discussed, as it differs from CP950 and HKSCS)
    my %box_drawing = (
        "2550" => "A2A4",    # also B5-F9F9
        "255E" => "A2A5",    # also B5-F9E9
        "2561" => "A2A7",    # also B5-F9EB
        "256A" => "A2A6",    # also B5-F9EA
        "256D" => "A27E",    # also B5-F9FA
        "256E" => "A2A1",    # also B5-F9FB
        "256F" => "A2A3",    # also B5-F9FD
        "2570" => "A2A2",    # also B5-F9FC
    );
    for my $ucs ( sort keys %box_drawing ) {
        $u2b{$ucs} = $box_drawing{$ucs};
    }

    # "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30
    my %numerals = (
        "A2CC" => "3038",
        "A2CD" => "3039",
        "A2CE" => "303A",
    );
    for my $code ( sort keys %numerals ) {
        my $ucs = $numerals{$code};
        $b2u{$code} = $ucs;
        $u2b{$ucs}  = $code;
    }

    # "Yi" also decodes from B5+C255 in TW-BIG5.
    $b2u{"C255"} = "5F5E";

}    # post_tuning()
michael@0 | 423 | |
michael@0 | 424 | |
# gen_charmapml()
#
# Print the mapping tables to the currently selected output handle as a
# CharMapML (UTR #22) XML document.
#
# The document id is "big5-hkscs-2001" when $hkscs_mode is true and
# "tw-big5-2002" otherwise.  The <assignments> element contains, in order:
#
#   1. <a>   round-trip mappings: Unicode values whose Big5 code decodes
#            back to the same Unicode string;
#   2. <fub> fallbacks Unicode -> bytes: the remaining %u2b entries;
#   3. <fbu> fallbacks bytes -> Unicode: %b2u entries whose Unicode value
#            does not encode back to the same Big5 code;
#   4. <range> tags for the user-defined areas, only when $use_range is
#            true and $hkscs_mode is false (in which case U+E000 .. U+F6B0
#            are left out of section 1).
sub gen_charmapml() {

    ###########################################################################
    #
    # Codes for generating CharMapML XML file

    # Both table flavours share the same prologue apart from the id.
    my $mapping_id = $hkscs_mode ? "big5-hkscs-2001" : "tw-big5-2002";

    print <<EOT;
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE characterMapping SYSTEM "http://www.unicode.org/unicode/reports/tr22/CharacterMapping.dtd">
<characterMapping id="$mapping_id" version="1">
<history>
<modified version="1" date="2002-11-30">
Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
and with some other manual tweaking.
</modified>
</history>
<validity>
<state type="FIRST" next="VALID" s="0" e="80" max="FFFF"/>
<state type="FIRST" next="SECOND" s="81" e="FE" max="FFFF"/>
<state type="SECOND" next="VALID" s="40" e="7E" max="FFFF"/>
<state type="SECOND" next="VALID" s="A1" e="FE" max="FFFF"/>
</validity>
<assignments sub="3F">
EOT

    # The same ordering of the Unicode keys is used by sections 1 and 2.
    my @unicodes = sort { hex($a) <=> hex($b) } keys %u2b;

    print " <!-- One to one mappings -->\n";
    for my $ucs (@unicodes) {
        my $code = $u2b{$ucs};
        my $u    = hex($ucs);
        next
          unless defined( $b2u{$code} )
          and $ucs eq $b2u{$code}
          and
          not( $use_range and !$hkscs_mode and $u >= 0xE000 && $u <= 0xF6B0 );
        printf " <a u=\"%04X\" ", $u;
        if ( hex($code) <= 0xFF ) {

            # Single-byte code: print one byte only.
            printf "b=\"%02X\"/>\n", hex($code);
        }
        else {
            printf "b=\"%s %s\"/>\n", substr( $code, 0, 2 ),
              substr( $code, 2, 2 );
        }
    }

    print " <!-- Fallback mappings from Unicode to bytes -->\n";
    for my $ucs (@unicodes) {
        my $code = $u2b{$ucs};
        next if defined( $b2u{$code} ) and hex($ucs) == hex( $b2u{$code} );
        if ( $ucs eq "F900" ) {

            # Three-line XML comment introducing the compatibility block.
            # Every line must be printed, otherwise the comment loses its
            # middle sentence.
            print " <!-- CJK Compatibility Ideographs: U+F900 - U+FA6A.\n";
            print
              " These are included in CP950 (Unicode->Big5 direction only).\n";
            print " Should we include this area in TW-BIG5 or not? -->\n";
        }
        printf " <fub u=\"%04X\" b=\"%s %s\"/>\n", hex($ucs),
          substr( $code, 0, 2 ), substr( $code, 2, 2 );
    }

    # Collect the bytes -> Unicode fallbacks keyed by Unicode value so that
    # they can be printed in Unicode order.
    my %fbu;
    print " <!-- Fallback mappings from bytes to Unicode -->\n";
    for my $code ( sort { hex($a) <=> hex($b) } keys %b2u ) {
        my $ucs = $b2u{$code};
        if ( !defined( $u2b{$ucs} ) or hex($code) != hex( $u2b{$ucs} ) ) {
            $fbu{$ucs} = $code;
        }
    }
    for my $ucs ( sort { hex($a) <=> hex($b) } keys %fbu ) {
        my $code = $fbu{$ucs};
        printf " <fbu u=\"%04X\" b=\"%s %s\"/>\n", hex($ucs),
          substr( $code, 0, 2 ), substr( $code, 2, 2 );
    }

    if ( $use_range and !$hkscs_mode ) {
        print <<EOT;
<!-- Roundtrip-mappings that can be enumerated
Note: We can only use the <range> tag for TW-BIG5.
Big-5E and Big5-HKSCS have assigned characters in these areas,
and we will have to use the <a> and <fub> tags instead.
-->
<!-- User-Defined Area 1 (UDA1) -->
<range uFirst="E000" uLast="E310" bFirst="FA 40" bLast="FE FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 2 (UDA2) -->
<range uFirst="E311" uLast="EEB7" bFirst="8E 40" bLast="A0 FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 3 (UDA3) -->
<range uFirst="EEB8" uLast="F6B0" bFirst="81 40" bLast="8D FE" bMin="81 40" bMax="FE FE"/>
EOT
    }

    print <<EOT;
</assignments>
</characterMapping>
EOT

}    # gen_charmapml()
michael@0 | 540 | |
# gen_check_b2u()
#
# Print a raw Big5 -> Unicode listing for verification and testing.
# One line per %b2u entry, ordered by the numeric value of the Big5 code:
#
#     U+<unicode>: <big5>
#
# A leading "00" is removed from the Big5 code before printing.
sub gen_check_b2u() {

    my @codes = sort { hex($a) <=> hex($b) } keys %b2u;

    for my $code (@codes) {
        ( my $shown = $code ) =~ s/^00//;
        print "U+", $b2u{$code}, ": ", $shown, "\n";
    }
}
michael@0 | 557 | |
# gen_check_u2b()
#
# Print a raw Unicode -> Big5 listing for verification and testing.
# One line per %u2b entry, ordered by the numeric value of the Unicode
# code point:
#
#     U+<unicode>: <big5>
#
# A leading "00" is removed from the Big5 code before printing.
sub gen_check_u2b() {

    my @code_points = sort { hex($a) <=> hex($b) } keys %u2b;

    for my $ucs (@code_points) {
        my $shown = $u2b{$ucs};
        $shown =~ s/^00//;
        print "U+", $ucs, ": ", $shown, "\n";
    }

}
michael@0 | 566 | |
michael@0 | 567 | ########################################################################### |
michael@0 | 568 | # |
michael@0 | 569 | # Codes for generating hkscs.ut and hkscs.uf files for Mozilla |
michael@0 | 570 | # |
# gen_mozilla_uf()
#
# Print the hkscs.uf table (Unicode -> Big5 direction) for Mozilla.
# One line per retained %u2b entry, in string order of the Unicode keys:
#
#     0x<BIG5>\t0x<UNICODE>
#
# An entry is left out when its Big5 code is below 0x8140, lies in
# 0xA140 .. 0xC6A0 or 0xC940 .. 0xF9D5, or when its Unicode value is
# above 0xFFFF.
sub gen_mozilla_uf() {

    # hkscs.uf
    for my $ucs ( sort keys %u2b ) {
        my $code  = $u2b{$ucs};
        my $value = hex($code);

        next if $value < 0x8140;
        next if $value >= 0xA140 and $value <= 0xC6A0;
        next if $value >= 0xC940 and $value <= 0xF9D5;
        next if hex($ucs) > 0xFFFF;

        print "0x", uc($code), "\t0x", uc($ucs), "\n";
    }
}
michael@0 | 583 | |
# gen_mozilla_ut()
#
# Print the hkscs.ut table (Big5 -> Unicode direction) for Mozilla.
# One line per retained %b2u entry, in string order of the Big5 keys:
#
#     0x<BIG5>\t0x<UNICODE>
#
# An entry is left out when its Big5 code is below 0x8140 or lies in
# 0xA140 .. 0xC6A0 or 0xC940 .. 0xF9D5.
sub gen_mozilla_ut() {

    # hkscs.ut
    for my $code ( sort keys %b2u ) {
        my $value = hex($code);

        next if $value < 0x8140;
        next if $value >= 0xA140 and $value <= 0xC6A0;
        next if $value >= 0xC940 and $value <= 0xF9D5;

        print "0x", uc($code), "\t0x", uc( $b2u{$code} ), "\n";
    }
}
michael@0 | 594 | |
michael@0 | 595 | |
michael@0 | 596 | ########################################################################### |
michael@0 | 597 | |
michael@0 | 598 | sub gen_glibc() { |
michael@0 | 599 | |
michael@0 | 600 | ########################################################################## |
michael@0 | 601 | # |
michael@0 | 602 | # Generate index for UCS4 to Big5-HKSCS conversion table |
michael@0 | 603 | # |
michael@0 | 604 | @index_array = (); |
michael@0 | 605 | |
michael@0 | 606 | $mode = 0; |
michael@0 | 607 | $count = 0; |
michael@0 | 608 | for ( $uni = 0x81 ; $uni <= 0x2FFFF ; $uni++ ) { |
michael@0 | 609 | $unicode = sprintf( "%04X", $uni ); |
michael@0 | 610 | |
michael@0 | 611 | # print " /* U+$unicode */\t" if $low % 4 == 0; |
michael@0 | 612 | if ( defined( $u2b{$unicode} ) ) { |
michael@0 | 613 | if ( $mode == 0 ) { |
michael@0 | 614 | $range_start = $range_end = $uni; |
michael@0 | 615 | |
michael@0 | 616 | # printf " { %7s, ", sprintf("0x%04X", $range_start); |
michael@0 | 617 | $mode = 1; |
michael@0 | 618 | } |
michael@0 | 619 | else { |
michael@0 | 620 | $range_end = $uni; |
michael@0 | 621 | } |
michael@0 | 622 | } |
michael@0 | 623 | elsif ( $mode == 1 and ( $uni - $range_end ) >= 0x80 ) { |
michael@0 | 624 | |
michael@0 | 625 | # Start a new range if the gap is 0x80 or larger |
michael@0 | 626 | # printf "%7s, %5d },\n", sprintf("0x%04X", $range_end), $count; |
michael@0 | 627 | push @index_array, [ ( $range_start, $range_end, $count ) ]; |
michael@0 | 628 | $count += $range_end - $range_start + 1; |
michael@0 | 629 | $mode = 0; |
michael@0 | 630 | } |
michael@0 | 631 | } |
michael@0 | 632 | |
michael@0 | 633 | # |
michael@0 | 634 | # Note that $count and $range_end are used again as global variables |
michael@0 | 635 | # below |
michael@0 | 636 | # |
michael@0 | 637 | |
michael@0 | 638 | ########################################################################### |
michael@0 | 639 | # |
michael@0 | 640 | # Start generating real C code... |
michael@0 | 641 | # |
michael@0 | 642 | |
michael@0 | 643 | print <<'EOT'; |
michael@0 | 644 | /* Mapping tables for Big5-HKSCS handling. |
michael@0 | 645 | Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc. |
michael@0 | 646 | This file is part of the GNU C Library. |
michael@0 | 647 | Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. |
michael@0 | 648 | Modified for Big5-HKSCS by Roger So <roger.so@sw-linux.com>, 2000. |
michael@0 | 649 | Updated for HKSCS-2001 by James Su <suzhe@turbolinux.com.cn> |
michael@0 | 650 | and Anthony Fok <anthony@thizlinux.com>, 2002 |
michael@0 | 651 | |
michael@0 | 652 | The GNU C Library is free software; you can redistribute it and/or |
michael@0 | 653 | modify it under the terms of the GNU Lesser General Public |
michael@0 | 654 | License as published by the Free Software Foundation; either |
michael@0 | 655 | version 2.1 of the License, or (at your option) any later version. |
michael@0 | 656 | |
michael@0 | 657 | The GNU C Library is distributed in the hope that it will be useful, |
michael@0 | 658 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
michael@0 | 659 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
michael@0 | 660 | Lesser General Public License for more details. |
michael@0 | 661 | |
michael@0 | 662 | You should have received a copy of the GNU Lesser General Public |
michael@0 | 663 | License along with the GNU C Library; if not, write to the Free |
michael@0 | 664 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
michael@0 | 665 | 02111-1307 USA. */ |
michael@0 | 666 | |
michael@0 | 667 | #include <dlfcn.h> |
michael@0 | 668 | #include <gconv.h> |
michael@0 | 669 | #include <stdint.h> |
michael@0 | 670 | #include <stdlib.h> |
michael@0 | 671 | #include <string.h> |
michael@0 | 672 | #include <wchar.h> |
michael@0 | 673 | |
michael@0 | 674 | |
michael@0 | 675 | /* Table for Big5-HKSCS to UCS conversion. |
michael@0 | 676 | |
michael@0 | 677 | Original comments by Roger So when he updated the tables for HKSCS-1999: |
michael@0 | 678 | |
michael@0 | 679 | With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info: |
michael@0 | 680 | http://www.digital21.gov.hk/eng/hkscs/index.html |
michael@0 | 681 | - spacehunt 07/01/2000 |
michael@0 | 682 | |
michael@0 | 683 | The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt |
michael@0 | 684 | and big5cmp.txt using a Perl script while merging C source code from |
michael@0 | 685 | other developers. A copy of the source Perl script is available at: |
michael@0 | 686 | |
michael@0 | 687 | http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl |
michael@0 | 688 | http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl |
michael@0 | 689 | |
michael@0 | 690 | Revisions: |
michael@0 | 691 | 2001-10-30 made codec for Qt |
michael@0 | 692 | 2002-03-21 ported to glibc-2.2.5 and added HKSCS-2001 |
michael@0 | 693 | |
michael@0 | 694 | Todo: |
michael@0 | 695 | Use a hash for characters beyond BMP to save space and make it |
michael@0 | 696 | more efficient |
michael@0 | 697 | |
michael@0 | 698 | - Anthony Fok <anthony@thizlinux.com> 21 Mar 2002 |
michael@0 | 699 | On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China |
michael@0 | 700 | */ |
michael@0 | 701 | |
michael@0 | 702 | EOT |
michael@0 | 703 | |
michael@0 | 704 | ########################################################################## |
michael@0 | 705 | # |
michael@0 | 706 | # Generate Big5-HKSCS to Unicode conversion table |
michael@0 | 707 | # |
michael@0 | 708 | |
michael@0 | 709 | ## print "Big5HKSCS to Unicode\n"; |
michael@0 | 710 | |
michael@0 | 711 | # for $high (0x81..0x8d, 0x8e..0xa0, 0xc6..0xc8, 0xf9, 0xfa..0xfe) { |
michael@0 | 712 | # Emit big5_hkscs_to_ucs[]: 157 uint16_t entries per lead byte in $high_start..$high_end. |
michael@0 | 713 | $high_start = 0x88; |
michael@0 | 714 | $high_end = 0xfe; |
michael@0 | 715 | # 157 = 63 trail bytes (0x40..0x7e) + 94 trail bytes (0xa1..0xfe); %b2u is keyed by "HHLL". |
michael@0 | 716 | print "static const uint16_t big5_hkscs_to_ucs["; |
michael@0 | 717 | print( ( $high_end - $high_start + 1 ) * 157 ); |
michael@0 | 718 | print "] =\n{\n"; |
michael@0 | 719 | for $high ( $high_start .. $high_end ) { # same bounds that sized the array above |
michael@0 | 720 | for $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) { |
michael@0 | 721 | if ( $low == 0x40 ) { # first trail byte: open a new lead-byte section |
michael@0 | 722 | print "\n" unless $high == $high_start; |
michael@0 | 723 | printf |
michael@0 | 724 | "\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n", |
michael@0 | 725 | $high, $high, $high, $high; |
michael@0 | 726 | } |
michael@0 | 727 | elsif ( $low == 0xa1 ) { # 0xa1 is not a multiple of 8, so indent explicitly |
michael@0 | 728 | print "\t\t"; |
michael@0 | 729 | } |
michael@0 | 730 | $big5 = sprintf( "%02X%02X", $high, $low ); |
michael@0 | 731 | print "\t" if $low % 8 == 0; # leading tab for each new row of eight |
michael@0 | 732 | if ( defined( $b2u{$big5} ) ) { |
michael@0 | 733 | $unicode = $b2u{$big5}; |
michael@0 | 734 | print "0x", $unicode, ","; |
michael@0 | 735 | } |
michael@0 | 736 | else { |
michael@0 | 737 | print "0x0000,"; # for glibc |
michael@0 | 738 | } |
michael@0 | 739 | print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe ) # end of row or of a trail range |
michael@0 | 740 | ? "\n" |
michael@0 | 741 | : "\t" ); |
michael@0 | 742 | } |
michael@0 | 743 | } |
michael@0 | 744 | print "};\n\n"; |
michael@0 | 745 | |
michael@0 | 746 | ########################################################################## |
michael@0 | 747 | # |
michael@0 | 748 | # Generate Unicode to Big5-HKSCS conversion table |
michael@0 | 749 | # (one two-byte C string literal per code point of every range in @index_array) |
michael@0 | 750 | print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n"; |
michael@0 | 751 | for $index (@index_array) { |
michael@0 | 752 | ( $start, $end ) = ( $index->[0], $index->[1] ); |
michael@0 | 753 | # A range that does not start on a multiple of four gets its own U+ label and padding. |
michael@0 | 754 | printf( " /* U+%04X */\t", $start ) if $start % 4; |
michael@0 | 755 | print "\t" x ( ( $start % 4 ) * 1.5 ) . " " x ( $start % 2 ); |
michael@0 | 756 | for ( $i = $start ; $i <= $end ; $i++ ) { |
michael@0 | 757 | printf( " /* U+%04X */\t", $i ) unless $i % 4; |
michael@0 | 758 | $unicode = sprintf( "%04X", $i ); |
michael@0 | 759 | # %u2b is keyed by four-digit upper-case hex; a value starting with "00" is written low byte first. |
michael@0 | 760 | $big5 = $u2b{$unicode}; |
michael@0 | 761 | if ( !defined $big5 ) { |
michael@0 | 762 | print '"\x00\x00",'; |
michael@0 | 763 | } |
michael@0 | 764 | elsif ( $big5 =~ /^00/ ) { |
michael@0 | 765 | print '"\x', substr( $big5, 2, 2 ), '\x00",'; |
michael@0 | 766 | } |
michael@0 | 767 | else { |
michael@0 | 768 | print '"\x', substr( $big5, 0, 2 ), '\x', substr( $big5, 2, 2 ), '",'; |
michael@0 | 769 | } |
michael@0 | 770 | print( $i % 4 == 3 ? "\n" : " " ) if $i != $end; |
michael@0 | 771 | } |
michael@0 | 772 | print $end == $range_end ? "\n" : "\n\n"; |
michael@0 | 773 | } |
michael@0 | 774 | print "};\n\n"; |
michael@0 | 775 | |
michael@0 | 776 | ########################################################################### |
michael@0 | 777 | # Emit from_ucs4_idx[]: one { from, to, offset } row per @index_array entry. |
michael@0 | 778 | print <<EOT; |
michael@0 | 779 | static struct |
michael@0 | 780 | { |
michael@0 | 781 | /* Note: We are going to split this table so that we can use |
michael@0 | 782 | uint16_t for "from" and "to" again. Anthony Fok, 2002-03-21 */ |
michael@0 | 783 | uint32_t from; |
michael@0 | 784 | uint32_t to; |
michael@0 | 785 | uint32_t offset; |
michael@0 | 786 | } from_ucs4_idx[] = |
michael@0 | 787 | { |
michael@0 | 788 | EOT |
michael@0 | 789 | for $index (@index_array) { |
michael@0 | 790 | printf " { %7s, %7s, %5d },\n", |
michael@0 | 791 | ( map { sprintf "0x%04X", $_ } @{$index}[ 0, 1 ] ), $index->[2]; |
michael@0 | 792 | } |
michael@0 | 793 | print "};\n\n"; |
michael@0 | 794 | |
michael@0 | 795 | #foreach $i (sort keys %b2u) { |
michael@0 | 796 | # print $b2u{$i} . ' '; |
michael@0 | 797 | #} |
michael@0 | 798 | |
michael@0 | 799 | print <<'EOT'; |
michael@0 | 800 | /* Definitions used in the body of the `gconv' function. */ |
michael@0 | 801 | #define CHARSET_NAME "BIG5HKSCS//" |
michael@0 | 802 | #define FROM_LOOP from_big5 |
michael@0 | 803 | #define TO_LOOP to_big5 |
michael@0 | 804 | #define DEFINE_INIT 1 |
michael@0 | 805 | #define DEFINE_FINI 1 |
michael@0 | 806 | #define MIN_NEEDED_FROM 1 |
michael@0 | 807 | #define MAX_NEEDED_FROM 2 |
michael@0 | 808 | #define MIN_NEEDED_TO 4 |
michael@0 | 809 | |
michael@0 | 810 | |
michael@0 | 811 | /* First define the conversion function from Big5-HKSCS to UCS4. */ |
michael@0 | 812 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM |
michael@0 | 813 | #define MAX_NEEDED_INPUT MAX_NEEDED_FROM |
michael@0 | 814 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO |
michael@0 | 815 | #define LOOPFCT FROM_LOOP |
michael@0 | 816 | #define BODY \ |
michael@0 | 817 | { \ |
michael@0 | 818 | uint32_t ch = *inptr; \ |
michael@0 | 819 | \ |
michael@0 | 820 | if (ch >= 0x81 && ch <= 0xfe) \ |
michael@0 | 821 | { \ |
michael@0 | 822 | /* Two-byte character. First test whether the next character \ |
michael@0 | 823 | is also available. */ \ |
michael@0 | 824 | uint32_t ch2; \ |
michael@0 | 825 | int idx; \ |
michael@0 | 826 | \ |
michael@0 | 827 | if (__builtin_expect (inptr + 1 >= inend, 0)) \ |
michael@0 | 828 | { \ |
michael@0 | 829 | /* The second character is not available. */ \ |
michael@0 | 830 | result = __GCONV_INCOMPLETE_INPUT; \ |
michael@0 | 831 | break; \ |
michael@0 | 832 | } \ |
michael@0 | 833 | \ |
michael@0 | 834 | ch2 = inptr[1]; \ |
michael@0 | 835 | /* See whether the second byte is in the correct range. */ \ |
michael@0 | 836 | if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe)) \ |
michael@0 | 837 | { \ |
michael@0 | 838 | if (ch >= 0x88) \ |
michael@0 | 839 | { \ |
michael@0 | 840 | /* Look up the table */ \ |
michael@0 | 841 | idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62); \ |
michael@0 | 842 | if ((ch = big5_hkscs_to_ucs[idx]) == 0) \ |
michael@0 | 843 | { \ |
michael@0 | 844 | /* This is illegal. */ \ |
michael@0 | 845 | if (! ignore_errors_p ()) \ |
michael@0 | 846 | { \ |
michael@0 | 847 | result = __GCONV_ILLEGAL_INPUT; \ |
michael@0 | 848 | break; \ |
michael@0 | 849 | } \ |
michael@0 | 850 | \ |
michael@0 | 851 | ++inptr; \ |
michael@0 | 852 | ++*irreversible; \ |
michael@0 | 853 | continue; \ |
michael@0 | 854 | } \ |
michael@0 | 855 | } \ |
michael@0 | 856 | else \ |
michael@0 | 857 | { \ |
michael@0 | 858 | /* 0x81..0x87 in UDA3, currently maps linearly to PUA */ \ |
michael@0 | 859 | ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62) \ |
michael@0 | 860 | + 0xeeb8; \ |
michael@0 | 861 | } \ |
michael@0 | 862 | } \ |
michael@0 | 863 | else \ |
michael@0 | 864 | { \ |
michael@0 | 865 | /* This is illegal. */ \ |
michael@0 | 866 | if (! ignore_errors_p ()) \ |
michael@0 | 867 | { \ |
michael@0 | 868 | result = __GCONV_ILLEGAL_INPUT; \ |
michael@0 | 869 | break; \ |
michael@0 | 870 | } \ |
michael@0 | 871 | \ |
michael@0 | 872 | ++inptr; \ |
michael@0 | 873 | ++*irreversible; \ |
michael@0 | 874 | continue; \ |
michael@0 | 875 | } \ |
michael@0 | 876 | \ |
michael@0 | 877 | inptr += 2; \ |
michael@0 | 878 | } \ |
michael@0 | 879 | else if (__builtin_expect (ch, 0) == 0xff) \ |
michael@0 | 880 | { \ |
michael@0 | 881 | result = __GCONV_ILLEGAL_INPUT; \ |
michael@0 | 882 | break; \ |
michael@0 | 883 | } \ |
michael@0 | 884 | else /* 0x00 to 0x80 */ \ |
michael@0 | 885 | ++inptr; \ |
michael@0 | 886 | \ |
michael@0 | 887 | put32 (outptr, ch); \ |
michael@0 | 888 | outptr += 4; \ |
michael@0 | 889 | } |
michael@0 | 890 | #define LOOP_NEED_FLAGS |
michael@0 | 891 | #include <iconv/loop.c> |
michael@0 | 892 | |
michael@0 | 893 | |
michael@0 | 894 | /* Next, define the other direction. */ |
michael@0 | 895 | #define MIN_NEEDED_INPUT MIN_NEEDED_TO |
michael@0 | 896 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM |
michael@0 | 897 | #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM |
michael@0 | 898 | #define LOOPFCT TO_LOOP |
michael@0 | 899 | #define BODY \ |
michael@0 | 900 | { \ |
michael@0 | 901 | uint32_t ch = get32 (inptr); \ |
michael@0 | 902 | const unsigned char *cp = ""; \ |
michael@0 | 903 | unsigned char b5ch[2] = "\0\0"; \ |
michael@0 | 904 | int i; \ |
michael@0 | 905 | \ |
michael@0 | 906 | for (i = 0; \ |
michael@0 | 907 | i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0])); \ |
michael@0 | 908 | ++i) \ |
michael@0 | 909 | { \ |
michael@0 | 910 | if (ch < from_ucs4_idx[i].from) \ |
michael@0 | 911 | break; \ |
michael@0 | 912 | if (from_ucs4_idx[i].to >= ch) \ |
michael@0 | 913 | { \ |
michael@0 | 914 | cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset \ |
michael@0 | 915 | + ch - from_ucs4_idx[i].from]; \ |
michael@0 | 916 | break; \ |
michael@0 | 917 | } \ |
michael@0 | 918 | } \ |
michael@0 | 919 | \ |
michael@0 | 920 | if (ch <= 0x80) \ |
michael@0 | 921 | { \ |
michael@0 | 922 | b5ch[0] = ch; \ |
michael@0 | 923 | cp = b5ch; \ |
michael@0 | 924 | } \ |
michael@0 | 925 | \ |
michael@0 | 926 | if (cp[0] == '\0' && ch != 0) \ |
michael@0 | 927 | { \ |
michael@0 | 928 | UNICODE_TAG_HANDLER (ch, 4); \ |
michael@0 | 929 | \ |
michael@0 | 930 | /* Illegal character. */ \ |
michael@0 | 931 | STANDARD_ERR_HANDLER (4); \ |
michael@0 | 932 | } \ |
michael@0 | 933 | else \ |
michael@0 | 934 | { \ |
michael@0 | 935 | /* See whether there is enough room for the second byte we write. */ \ |
michael@0 | 936 | if (__builtin_expect (cp[1], '\1') != '\0' \ |
michael@0 | 937 | && __builtin_expect (outptr + 1 >= outend, 0)) \ |
michael@0 | 938 | { \ |
michael@0 | 939 | /* We have not enough room. */ \ |
michael@0 | 940 | result = __GCONV_FULL_OUTPUT; \ |
michael@0 | 941 | break; \ |
michael@0 | 942 | } \ |
michael@0 | 943 | \ |
michael@0 | 944 | *outptr++ = cp[0]; \ |
michael@0 | 945 | if (cp[1] != '\0') \ |
michael@0 | 946 | *outptr++ = cp[1]; \ |
michael@0 | 947 | } \ |
michael@0 | 948 | \ |
michael@0 | 949 | inptr += 4; \ |
michael@0 | 950 | } |
michael@0 | 951 | #define LOOP_NEED_FLAGS |
michael@0 | 952 | #include <iconv/loop.c> |
michael@0 | 953 | |
michael@0 | 954 | |
michael@0 | 955 | /* Now define the toplevel functions. */ |
michael@0 | 956 | #include <iconv/skeleton.c> |
michael@0 | 957 | EOT |
michael@0 | 958 | |
michael@0 | 959 | } |