intl/uconv/tools/jamap.pl

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

#!/usr/local/bin/perl
# jamap.pl - builds the Japanese conversion tables.
#
# Reads the CP932.TXT, IBM943.TXT and JIS0212.TXT mapping files from the
# current directory and writes the *-uf-unify mapping files plus
# japanese.map.
use strict;

# Name of every mapping file read so far, in reading order.  The list is
# echoed into the "generated by" comment of japanese.map.
my @source_files;

# Shift_JIS lead byte -> JIS row base (row number already in the high byte),
# or a marker value:
#   -1  index 0 only: the code has no lead byte, i.e. it is a single byte
#   -2  lead bytes 0xF0..0xF9, reported by sjis_to_jis() as 'private'
# Lead bytes without an entry are reported as 'sjis2undef'.
my @sjis_h;
$sjis_h[0] = -1;
@sjis_h[0x81 .. 0x9f] = map { 0x2100 + $_ * 0x200 } (0 .. 30);
@sjis_h[0xe0 .. 0xef] = map { 0x5F00 + $_ * 0x200 } (0 .. 15);
@sjis_h[0xf0 .. 0xf9] = (-2) x 10;

# Shift_JIS trail byte -> offset added to the row base.  Every lead byte
# spans two JIS rows: trail bytes 0x40..0x7e and 0x80..0x9e fall into the
# first row (0x21..0x7e), 0x9f..0xfc into the second one (0x100 + 0x21..0x7e).
# 0x7f and all bytes outside 0x40..0xfc have no entry.
my @sjis_l;
@sjis_l[0x40 .. 0x7e] = (0x21 .. 0x5f);
@sjis_l[0x80 .. 0xfc] = (0x60 .. 0x7e, 0x121 .. 0x17e);

# sjis_to_jis($s)
#
# Classifies the Shift_JIS code $s (an integer, one or two bytes) and
# converts it to a JIS code where that is possible.
#
# Returns the list ($j, $type):
#   'jis0201'       single byte 0x00..0x7f or 0xa1..0xdf; $j == $s
#   'sjis1undef'    any other single byte;                 $j == $s
#   'jis0208'       converted code below 0x2900 or in 0x3000..0x74ff
#   'jis0208undef'  converted code outside those ranges
#   'private'       lead byte 0xf0..0xf9;                  $j == $s
#   'sjis2undef'    unknown lead byte or trail byte;       $j == $s
sub sjis_to_jis {
    my ($s) = @_;

    my $row_base = $sjis_h[ ( $s >> 8 ) & 0xff ];

    # Lead byte that is not in the table at all.
    return ( $s, 'sjis2undef' ) if !defined $row_base;

    if ( $row_base == -1 ) {    # single byte
        my $is_roman = $s <= 0x7f;
        my $is_kana  = $s >= 0xa1 && $s <= 0xdf;
        return ( $s, ( $is_roman || $is_kana ) ? 'jis0201' : 'sjis1undef' );
    }

    if ( $row_base == -2 ) {    # private use
        return ( $s, 'private' );
    }

    # Any other non-positive marker is treated like an unknown lead byte.
    return ( $s, 'sjis2undef' ) if $row_base <= 0;

    # Two-byte code: the trail byte must have a table entry.
    my $offset = $sjis_l[ $s & 0xff ];
    return ( $s, 'sjis2undef' ) if !$offset;

    my $j          = $row_base + $offset;
    my $in_kanji   = $j >= 0x3000 && $j < 0x7500;
    my $in_low_row = $j < 0x2900;
    return ( $j, ( $in_kanji || $in_low_row ) ? 'jis0208' : 'jis0208undef' );
}


# read_sjis_map($filename, $s_col, $u_col)
#
# Reads a whitespace-separated mapping file.  Column $s_col holds the
# Shift_JIS code and column $u_col the Unicode code point, both written as
# 0x... hexadecimal; lines where either column is missing or not in that
# form are skipped.
#
# Returns a hash (as a list) keyed by the type reported by sjis_to_jis();
# each value is a reference to an array of [$jis, $sjis, $unicode] triples.
# Dies if the file cannot be opened.  The file name is appended to
# @source_files.
sub read_sjis_map {
    my ( $filename, $s_col, $u_col ) = @_;
    my %map;

    # Three-argument open with a lexical handle: the name is never parsed
    # for mode characters and the shared MAP handle is left alone.
    open my $in, '<', $filename or die "Cannot open $filename: $!";
    while ( my $line = <$in> ) {
        my @cols = split /\s+/, $line;
        my ( $s, $u ) = @cols[ $s_col, $u_col ];
        next unless defined $s && defined $u;
        next unless $s =~ /^0x[0-9A-Fa-f]+$/ && $u =~ /^0x[0-9A-Fa-f]+$/;

        $s = oct($s);
        $u = oct($u);

        my ( $j, $type ) = sjis_to_jis($s);
        push @{ $map{$type} }, [ $j, $s, $u ];
    }
    close $in or warn "Error closing $filename: $!";

    push @source_files, $filename;
    return %map;
}

# read_0212_map($filename, $j_col, $u_col)
#
# Reads a whitespace-separated mapping file.  Column $j_col holds the JIS
# code and column $u_col the Unicode code point, both written as 0x...
# hexadecimal; lines where either column is missing or not in that form are
# skipped.
#
# Returns a reference to an array of [$jis, 0, $unicode] triples (the middle
# slot is the Shift_JIS position used by the other maps and is always 0
# here).  The reference is always defined, even when no line matched.
# Dies if the file cannot be opened.  The file name is appended to
# @source_files.
sub read_0212_map {
    my ( $filename, $j_col, $u_col ) = @_;
    my $map = [];

    # Three-argument open with a lexical handle: the name is never parsed
    # for mode characters and the shared MAP handle is left alone.
    open my $in, '<', $filename or die "Cannot open $filename: $!";
    while ( my $line = <$in> ) {
        my @cols = split /\s+/, $line;
        my ( $j, $u ) = @cols[ $j_col, $u_col ];
        next unless defined $j && defined $u;
        next unless $j =~ /^0x[0-9A-Fa-f]+$/ && $u =~ /^0x[0-9A-Fa-f]+$/;

        $j = oct($j);
        $u = oct($u);

        # An entry mapped to U+007E is redirected to U+FF5E.
        $u = 0xff5e if $u == 0x007e;

        push @$map, [ $j, 0, $u ];
    }
    close $in or warn "Error closing $filename: $!";

    push @source_files, $filename;
    return $map;
}


# Unicode code point -> code already written for it.  Shared by successive
# write_fromu_map() calls; the caller empties it to start a new group of
# output files.
my %printed;

# write_fromu_map($filename, $code, @maps)
#
# Writes "code<TAB>unicode" lines to $filename for every [$jis, $sjis,
# $unicode] triple found in @maps (array references; undefined entries are
# skipped).  $code selects the code that is written: 'sjis' uses the
# Shift_JIS value, anything else the JIS value.
#
# Only the first code seen for a Unicode code point is written.  A later
# triple that assigns a different code to the same code point is reported
# on standard output as a conflict and not written.
# Dies if the file cannot be created.
sub write_fromu_map {
    my ( $filename, $code, @maps ) = @_;

    # Three-argument open: the name is never parsed for mode characters.
    open my $out, '>', $filename or die "Cannot create $filename: $!";

    foreach my $map (@maps) {
        next unless defined $map;
        foreach my $pair (@$map) {
            my ( $j, $s, $u ) = @$pair;
            my $target = $code eq 'sjis' ? $s : $j;

            if ( defined $printed{$u} ) {
                # Both values are integers, so compare them numerically.
                if ( $printed{$u} != $target ) {
                    printf "conflict 0x%04x to 0x%04x, 0x%04x\n", $u, $printed{$u}, $target;
                }
                next;
            }

            # Single-byte codes are written with two hex digits.
            my $format = $target < 0x100 ? "0x%02X\t0x%04X\n" : "0x%04X\t0x%04X\n";
            printf {$out} $format, $target, $u;
            $printed{$u} = $target;
        }
    }

    close $out or warn "Error closing $filename: $!";
}

# Registry of 94-cell tables that make up gJapaneseMap:
#   @table             start index -> reference to the cells stored there
#   %table             content key (see _table_key) -> start index
#   $table_next_count  start index the next new table will receive
my @table;
my %table;
my $table_next_count = 0;

# Builds the lookup key for cells $first..$last of the array referenced by
# $cells: the integer values joined with commas, missing cells counting as 0.
sub _table_key {
    my ( $cells, $first, $last ) = @_;
    return join ',', map { int( $cells->[$_] || 0 ) } ( $first .. $last );
}

# get_94table_index($map_table)
#
# Returns the start index of the table whose first 94 cells equal those of
# $map_table (array reference, or undef for an all-empty table).  A table
# that has not been seen before is registered at the next free index.
sub get_94table_index {
    my ($map_table) = @_;
    $map_table = [] unless defined $map_table;

    my $key         = _table_key( $map_table, 0, 93 );
    my $table_index = $table{$key};
    return $table_index if defined $table_index;

    $table_index = $table_next_count;
    $table_next_count += 94;
    $table[$table_index] = $map_table;
    $table{$key}         = $table_index;
    return $table_index;
}

# get_188table_index($map_table)
#
# Same as get_94table_index() for a table of 188 cells, stored as two
# consecutive 94-cell tables.  When a new table is registered, each half
# also becomes available to get_94table_index() unless an identical
# 94-cell table is already known.
sub get_188table_index {
    my ($map_table) = @_;
    $map_table = [] unless defined $map_table;

    my $key         = _table_key( $map_table, 0, 187 );
    my $table_index = $table{$key};
    return $table_index if defined $table_index;

    # The half tables and their keys are only needed for a new table.
    my $first_half  = [ @{$map_table}[ 0 .. 93 ] ];
    my $second_half = [ @{$map_table}[ 94 .. 187 ] ];
    my $first_key   = _table_key( $map_table, 0,  93 );
    my $second_key  = _table_key( $map_table, 94, 187 );

    $table_index = $table_next_count;
    $table_next_count += 188;
    $table[$table_index]        = $first_half;
    $table[ $table_index + 94 ] = $second_half;
    $table{$key}                = $table_index;
    $table{$first_key}  = $table_index      unless defined $table{$first_key};
    $table{$second_key} = $table_index + 94 unless defined $table{$second_key};
    return $table_index;
}

# Register the all-empty table first so that it gets index 0.
get_188table_index([]);

# print_sjis_table_index(@maps)
#
# Writes one index entry per Shift_JIS lead byte 0x80..0xff to the MAP
# handle, eight entries per line.  @maps are references to arrays of
# [$jis, $sjis, $unicode] triples.
sub print_sjis_table_index {
    my @maps = @_;

    # Lead byte -> reference to the 188 cells collected for it.
    my %cells_by_lead;
    foreach my $map (@maps) {
        foreach my $entry (@$map) {
            my ( undef, $sjis, $unicode ) = @$entry;
            my $lead  = $sjis >> 8;
            my $trail = $sjis & 0xff;

            # Trail bytes 0x40..0x7e and 0x80..0xfc are packed into the
            # cells 0..187; every other trail byte is ignored.
            my $slot;
            if ( $trail >= 0x40 && $trail <= 0x7e ) {
                $slot = $trail - 0x40;
            }
            elsif ( $trail >= 0x80 && $trail <= 0xfc ) {
                $slot = $trail - 0x41;
            }
            else {
                next;
            }

            my $seen = $cells_by_lead{$lead}->[$slot];
            print "conflict!\n" if defined $seen && $seen != $unicode;
            $cells_by_lead{$lead}->[$slot] = $unicode;
        }
    }

    foreach my $lead ( 0x80 .. 0xff ) {
        print MAP "\n " unless $lead & 0x7;

        my $field;
        if ( $lead == 0x80 ) {
            $field = " 0xFFFD,";
        }
        elsif ( $lead == 0xa0 ) {
            $field = " 0xF8F0,";
        }
        elsif ( $lead >= 0xa1 && $lead <= 0xdf ) {
            # 0xA1..0xDF become U+FF61..U+FF9F.
            $field = sprintf " 0x%04X,", $lead + 0xfec0;
        }
        elsif ( $lead >= 0xf0 && $lead <= 0xf9 ) {
            # Each of these lead bytes gets 188 code points from U+E000 on.
            $field = sprintf " 0x%04X,", 0xe000 + ( $lead - 0xf0 ) * 188;
        }
        elsif ( $lead >= 0xfd ) {
            # 0xFD..0xFF become U+F8F1..U+F8F3.
            $field = sprintf " 0x%04X,", $lead + ( 0xf8f1 - 0xfd );
        }
        else {
            my $table_index = get_188table_index( $cells_by_lead{$lead} );
            $field = sprintf " %6d,", $table_index;
        }
        print MAP $field;
    }
}

# print_jis_table_index(@maps)
#
# Writes one index entry per JIS row byte 0x00..0x7f to the MAP handle,
# eight entries per line: the table index from get_94table_index() for the
# rows 0x21..0x7e and 0xFFFD for every other value.  @maps are references
# to arrays of [$jis, $sjis, $unicode] triples.
sub print_jis_table_index {
    my @maps = @_;

    # Row byte -> reference to the cells collected for that row.
    my %map_table;
    foreach my $map (@maps) {
        foreach my $pair (@$map) {
            my ( $j, $s, $u ) = @$pair;
            my $row  = $j >> 8;
            my $cell = ( $j & 0xff ) - 0x21;

            # A low byte below 0x21 would give a negative subscript, which
            # Perl counts from the end of the array: it would overwrite a
            # valid cell, or die on a row that is still empty.
            next if $cell < 0;

            if ( defined( $map_table{$row}->[$cell] ) && $map_table{$row}->[$cell] != $u ) {
                print "conflict!\n";
            }
            $map_table{$row}->[$cell] = $u;
        }
    }

    for my $i ( 0 .. 0x7f ) {
        if ( ( $i & 0x7 ) == 0 ) {
            print MAP "\n ";
        }
        if ( $i >= 0x21 && $i <= 0x7e ) {
            my $table_index = get_94table_index( $map_table{$i} );
            printf MAP " %6d,", $table_index;
        }
        else {
            print MAP " 0xFFFD,";
        }
    }
}

# print_table_index($map_name, @maps)
#
# Writes three C definitions to the MAP handle: the Shift_JIS index array,
# the JIS X 0208 index array, and an array of pointers to those two.
# $map_name becomes part of every identifier.
sub print_table_index {
    my ( $map_name, @tables ) = @_;

    print MAP "static const uint16_t g${map_name}IndexShiftJis[] = {";
    print_sjis_table_index(@tables);
    print MAP "\n};\n",
        "static const uint16_t g${map_name}IndexJis0208[] = {";
    print_jis_table_index(@tables);
    print MAP "\n};\n",
        "static const uint16_t * const g${map_name}Index[] = {",
        "\n g${map_name}IndexShiftJis, g${map_name}IndexJis0208",
        "\n};\n\n";
}

# print_0212_table_index($map_name, @maps)
#
# Writes a single C index array named g<map_name>Index to the MAP handle,
# filled by print_jis_table_index().
sub print_0212_table_index {
    my ( $map_name, @tables ) = @_;

    print MAP "static const uint16_t g${map_name}Index[] = {";
    print_jis_table_index(@tables);
    print MAP "\n};\n\n";
}


# print_table()
#
# Writes the gJapaneseMap C array to the MAP handle: every registered
# 94-cell table in index order, each introduced by an "index N" comment.
# Empty cells are written as 0xFFFD.
sub print_table {
    print MAP "static const uint16_t gJapaneseMap[] = {";

    my $start = 0;
    while ( $start < $table_next_count ) {
        print MAP "\n /* index $start */\n ";
        my $cells = $table[$start];

        # The counter starts at 1, so the first line of every table holds
        # seven values and the following lines eight.
        my $on_line = 1;
        foreach my $col ( 0 .. 93 ) {
            my $u = $cells->[$col] || 0xfffd;
            printf MAP " 0x%04X,", $u;

            $on_line++;
            if ( $on_line == 8 ) {
                print MAP "\n ";
                $on_line = 0;
            }
        }

        $start += 94;
    }

    print MAP "\n};\n";
}


# ---- Read the source mapping files (columns: 0 = code, 1 = Unicode) ----
my %cp932   = read_sjis_map( 'CP932.TXT',   0, 1 );
my %ibm     = read_sjis_map( 'IBM943.TXT',  0, 1 );
my $jis0212 = read_0212_map( 'JIS0212.TXT', 0, 1 );

# ---- Unicode -> code mapping files ----
# %printed is not emptied between the first two files, so a code point
# written to jis0201-uf-unify is not written again to jis0208-uf-unify.
%printed = ();
write_fromu_map( 'jis0201-uf-unify', 'jis',
    $cp932{jis0201},
    $ibm{jis0201}
);
write_fromu_map( 'jis0208-uf-unify', 'jis',
    $cp932{jis0208},
    $ibm{jis0208}
);

%printed = ();
write_fromu_map( 'jis0208ext-uf-unify', 'jis',
    $cp932{jis0208undef},
    $ibm{jis0208undef}
);

%printed = ();
write_fromu_map( 'sjis-uf-unify', 'sjis',
    @cp932{ 'jis0201', 'jis0208', 'jis0208undef', 'sjis1undef', 'sjis2undef' },
    @ibm{ 'jis0201', 'jis0208', 'jis0208undef', 'sjis1undef', 'sjis2undef' }
);

# ---- japanese.map ----
# MAP stays a bareword handle because the print_* subs write to it by name.
# Three-argument open keeps the file name from being parsed for a mode.
open MAP, '>', 'japanese.map' or die "Cannot create japanese.map: $!";
binmode MAP;

# Copy the header template from the DATA section up to the line that
# starts with "!".
while (<DATA>) {
    if ( /^!/ ) { last; }
    print MAP;
}
print MAP "/* generated by jamap.pl @source_files */\n\n";
print MAP <<EOM;
// IE-compatible handling of undefined codepoints:
// 0x80 --> U+0080
// 0xa0 --> U+F8F0
// 0xfd --> U+F8F1
// 0xfe --> U+F8F2
// 0xff --> U+F8F3
EOM

print_table_index( 'CP932',  @cp932{ 'jis0208', 'jis0208undef', 'sjis2undef' } );
print_table_index( 'IBM943', @ibm{ 'jis0208', 'jis0208undef', 'sjis2undef' } );
print_0212_table_index( 'JIS0212', $jis0212 );
print_table();

close MAP or warn "Error closing japanese.map: $!";

__DATA__
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

!

mercurial