#!/usr/local/bin/perl
# jamap.pl (intl/uconv/tools/jamap.pl)
#
# Reads the mapping tables CP932.TXT, IBM943.TXT and JIS0212.TXT (relative
# paths) and writes:
#   jis0201-uf-unify, jis0208-uf-unify, jis0208ext-uf-unify, sjis-uf-unify
#       "code<TAB>unicode" lines, one per Unicode value not yet printed;
#   japanese.map
#       C source: per-charset uint16_t index arrays plus the shared
#       gJapaneseMap table they index into.
# Conflict diagnostics are printed to STDOUT.
use strict;
use warnings;

# Names of every input file read so far; echoed into japanese.map.
my @source_files;

# Indexed by Shift_JIS lead byte:
#   -1  no lead byte (single-byte code)
#   >0  base JIS code of the first row covered by that lead byte
#   -2  private use (0xf0..0xf9)
# Lead bytes without an entry are treated as undefined.
my @sjis_h;
$sjis_h[0] = -1;
@sjis_h[0x81..0x9f] = map { 0x2100 + $_ * 0x200 } (0 .. 30);
@sjis_h[0xe0..0xef] = map { 0x5F00 + $_ * 0x200 } (0 .. 15);
@sjis_h[0xf0..0xf9] = (-2) x 10;

# Indexed by Shift_JIS trail byte: offset added to the lead-byte base.
# Offsets 0x121..0x17e carry into the following JIS row.
my @sjis_l;
@sjis_l[0x40..0x7e] = (0x21..0x5f);
@sjis_l[0x80..0xfc] = (0x60..0x7e, 0x121..0x17e);

# sjis_to_jis($s) -> ($code, $type)
#
# Classifies the Shift_JIS code $s. $type is one of:
#   jis0208       converted code is below 0x2900 or in 0x3000..0x74ff
#   jis0208undef  any other converted code
#   jis0201       single byte <= 0x7f or in 0xa1..0xdf
#   sjis1undef    any other single byte
#   private       lead byte 0xf0..0xf9
#   sjis2undef    lead or trail byte has no table entry
# $code is the converted JIS code for the two jis0208 types and the
# unchanged input for every other type.
sub sjis_to_jis {
    my ($s) = @_;

    my $base = $sjis_h[ ( $s >> 8 ) & 0xff ];
    $base = 0 unless defined $base;    # no entry: falls through to sjis2undef

    if ( $base > 0 ) {                 # jis0208
        my $offset = $sjis_l[ $s & 0xff ];
        if ( !defined $offset || $offset == 0 ) {
            return ( $s, 'sjis2undef' );
        }
        my $j = $base + $offset;
        if ( $j >= 0x3000 && $j < 0x7500 ) {    # jis0208 kanji
            return ( $j, 'jis0208' );
        }
        if ( $j < 0x2900 ) {                    # jis0208
            return ( $j, 'jis0208' );
        }
        return ( $j, 'jis0208undef' );
    }

    if ( $base == -1 ) {                        # single byte
        if ( $s <= 0x7f ) {                     # jis0201 roman
            return ( $s, 'jis0201' );
        }
        if ( $s >= 0xa1 && $s <= 0xdf ) {       # jis0201 kana
            return ( $s, 'jis0201' );
        }
        return ( $s, 'sjis1undef' );            # sjis single byte undefined
    }

    if ( $base == -2 ) {                        # private use
        return ( $s, 'private' );
    }

    return ( $s, 'sjis2undef' );                # sjis undefined
}

# read_sjis_map($filename, $s_col, $u_col) -> %map
#
# Reads a whitespace-separated mapping file. $s_col and $u_col are the
# 0-based columns holding the Shift_JIS code and the Unicode value; lines
# where either column is not of the form 0xHHHH are skipped.
# Returns a hash (as a list) keyed by the type from sjis_to_jis(); each value
# is an array ref of [ $jis_or_sjis_code, $sjis_code, $unicode ] triples.
# Dies if the file cannot be opened; appends $filename to @source_files.
sub read_sjis_map {
    my ($filename, $s_col, $u_col) = @_;
    my %map;

    open my $in, '<', $filename or die "Cannot open $filename: $!";
    while ( my $line = <$in> ) {
        # split ' ' ignores leading whitespace, so column 0 is always the
        # first token on the line.
        my @cols = split ' ', $line;
        my ($s, $u) = @cols[$s_col, $u_col];
        next unless defined $s && defined $u;
        next unless $s =~ /^0x[0-9A-Fa-f]+$/ && $u =~ /^0x[0-9A-Fa-f]+$/;

        $s = oct($s);    # oct() understands the 0x prefix
        $u = oct($u);

        my ($j, $type) = sjis_to_jis($s);
        push @{ $map{$type} }, [ $j, $s, $u ];
    }
    close $in or warn "Error closing $filename: $!";

    push @source_files, $filename;
    return %map;
}

# read_0212_map($filename, $j_col, $u_col) -> \@entries
#
# Reads a whitespace-separated mapping file. $j_col and $u_col are the
# 0-based columns holding the JIS code and the Unicode value; lines where
# either column is not of the form 0xHHHH are skipped. U+007E is replaced by
# U+FF5E. Returns an array ref of [ $jis_code, 0, $unicode ] triples.
# Dies if the file cannot be opened; appends $filename to @source_files.
sub read_0212_map {
    my ($filename, $j_col, $u_col) = @_;
    my @entries;

    open my $in, '<', $filename or die "Cannot open $filename: $!";
    while ( my $line = <$in> ) {
        my @cols = split ' ', $line;
        my ($j, $u) = @cols[$j_col, $u_col];
        next unless defined $j && defined $u;
        next unless $j =~ /^0x[0-9A-Fa-f]+$/ && $u =~ /^0x[0-9A-Fa-f]+$/;

        $j = oct($j);
        $u = oct($u);
        $u = 0xff5e if $u == 0x007e;

        push @entries, [ $j, 0, $u ];
    }
    close $in or warn "Error closing $filename: $!";

    push @source_files, $filename;
    return \@entries;
}

# Unicode value => code already written by write_fromu_map(). Shared across
# calls on purpose: the caller decides when to reset it.
my %printed;

# write_fromu_map($filename, $code, @maps)
#
# Writes "code<TAB>unicode" lines to $filename for every triple in @maps
# (array refs of [ $j, $s, $u ]; undefined entries are ignored). The code
# column is $s when $code is 'sjis', otherwise $j. A Unicode value already
# present in %printed is not written again; if it was recorded with a
# different code, a "conflict" line goes to STDOUT instead.
# Dies if the file cannot be opened.
sub write_fromu_map {
    my ($filename, $code, @maps) = @_;

    open my $out, '>', $filename or die "Cannot write $filename: $!";
    foreach my $map (@maps) {
        next unless defined $map;    # category with no entries
        foreach my $pair (@$map) {
            my ($j, $s, $u) = @$pair;
            $j = $s if $code eq 'sjis';

            if ( defined $printed{$u} ) {
                if ( $printed{$u} != $j ) {
                    printf "conflict 0x%04x to 0x%04x, 0x%04x\n", $u, $printed{$u}, $j;
                }
                next;
            }

            if ( $j < 0x100 ) {
                printf {$out} "0x%02X\t0x%04X\n", $j, $u;
            } else {
                printf {$out} "0x%04X\t0x%04X\n", $j, $u;
            }
            $printed{$u} = $j;
        }
    }
    close $out or warn "Error closing $filename: $!";
}

# Pool of 94-cell tables emitted by print_table(). $table[$n] is set only at
# multiples of 94; %table maps a table's content key to its index so that
# identical tables are stored once.
my @table;
my %table;
my $table_next_count = 0;

# Builds the lookup key for cells $first..$last of $cells: the integer value
# of each cell, with missing cells counted as 0, joined by commas.
sub _table_key {
    my ($cells, $first, $last) = @_;
    return join ',',
        map { defined $cells->[$_] ? int( $cells->[$_] ) : 0 } ( $first .. $last );
}

# get_94table_index($map_table) -> index into the table pool
#
# Returns the pool index of a table whose first 94 cells equal those of
# $map_table (array ref, or undef for an empty table), registering
# $map_table as a new pool entry when no such table exists yet.
sub get_94table_index {
    my ($map_table) = @_;
    $map_table = [] unless defined $map_table;

    my $key         = _table_key( $map_table, 0, 93 );
    my $table_index = $table{$key};
    if ( !defined $table_index ) {
        $table_index = $table_next_count;
        $table_next_count += 94;
        $table[$table_index] = $map_table;
        $table{$key} = $table_index;
    }
    return $table_index;
}

# get_188table_index($map_table) -> index into the table pool
#
# Like get_94table_index() but for 188 cells stored as two consecutive
# 94-cell tables. When a new pair is registered, each half is also made
# available to get_94table_index() unless an equal half is already known.
sub get_188table_index {
    my ($map_table) = @_;
    $map_table = [] unless defined $map_table;

    my $first_half  = [ @{$map_table}[ 0 .. 93 ] ];
    my $second_half = [ @{$map_table}[ 94 .. 187 ] ];
    my $key         = _table_key( $map_table, 0,  187 );
    my $key1        = _table_key( $map_table, 0,  93 );
    my $key2        = _table_key( $map_table, 94, 187 );

    my $table_index = $table{$key};
    if ( !defined $table_index ) {
        $table_index = $table_next_count;
        $table_next_count += 188;
        $table[$table_index]      = $first_half;
        $table[ $table_index + 94 ] = $second_half;
        $table{$key} = $table_index;
        $table{$key1} = $table_index      unless defined $table{$key1};
        $table{$key2} = $table_index + 94 unless defined $table{$key2};
    }
    return $table_index;
}

# Register the all-empty table first so that any row without mappings
# resolves to index 0.
get_188table_index([]);

# print_sjis_table_index(@maps)
#
# Writes to the MAP handle one comma-terminated entry per lead byte
# 0x80..0xff, eight per line. Fixed values are written for 0x80, 0xa0,
# 0xa1..0xdf, 0xf0..0xf9 and 0xfd..0xff; every other lead byte gets the pool
# index of its 188-cell row, built from the [ $j, $s, $u ] triples in @maps.
# Triples whose trail byte is outside 0x40..0x7e / 0x80..0xfc are ignored.
# Prints "conflict!" to STDOUT when a cell is assigned two different values.
sub print_sjis_table_index {
    my @maps = @_;
    my %map_table;

    foreach my $map (@maps) {
        next unless defined $map;
        foreach my $pair (@$map) {
            my ($j, $s, $u) = @$pair;
            my $row  = $s >> 8;
            my $cell = $s & 0xff;
            if ( $cell >= 0x40 && $cell <= 0x7e ) {
                $cell -= 0x40;
            } elsif ( $cell >= 0x80 && $cell <= 0xfc ) {
                $cell -= 0x41;    # trail byte 0x7f is not used
            } else {
                next;
            }
            if ( defined( $map_table{$row}->[$cell] ) && $map_table{$row}->[$cell] != $u ) {
                print "conflict!\n";
            }
            $map_table{$row}->[$cell] = $u;
        }
    }

    for my $i ( 0x80 .. 0xff ) {
        if ( ( $i & 0x7 ) == 0 ) {
            print MAP "\n ";
        }
        if ( $i >= 0xa1 && $i <= 0xdf ) {
            printf MAP " 0x%04X,", $i + 0xfec0;
        } elsif ( $i >= 0xf0 && $i <= 0xf9 ) {
            printf MAP " 0x%04X,", 0xe000 + ( $i - 0xf0 ) * 188;
        } elsif ( $i == 0x80 ) {
            print MAP " 0xFFFD,";
        } elsif ( $i == 0xa0 ) {
            print MAP " 0xF8F0,";
        } elsif ( $i >= 0xfd ) {
            printf MAP " 0x%04X,", $i + ( 0xf8f1 - 0xfd );
        } else {
            my $table_index = get_188table_index( $map_table{$i} );
            printf MAP " %6d,", $table_index;
        }
    }
}

# print_jis_table_index(@maps)
#
# Writes to the MAP handle one comma-terminated entry per row byte
# 0x00..0x7f, eight per line: the pool index of the row's 94-cell table for
# 0x21..0x7e, 0xFFFD otherwise. Rows are built from the $j code of the
# [ $j, $s, $u ] triples in @maps.
# Prints "conflict!" to STDOUT when a cell is assigned two different values.
sub print_jis_table_index {
    my @maps = @_;
    my %map_table;

    foreach my $map (@maps) {
        next unless defined $map;
        foreach my $pair (@$map) {
            my ($j, $s, $u) = @$pair;
            my $row  = $j >> 8;
            my $cell = ( $j & 0xff ) - 0x21;
            # A low byte below 0x21 has no cell; a negative index would
            # address the row from its end (or die on an empty row).
            next if $cell < 0;
            if ( defined( $map_table{$row}->[$cell] ) && $map_table{$row}->[$cell] != $u ) {
                print "conflict!\n";
            }
            $map_table{$row}->[$cell] = $u;
        }
    }

    for my $i ( 0 .. 0x7f ) {
        if ( ( $i & 0x7 ) == 0 ) {
            print MAP "\n ";
        }
        if ( $i >= 0x21 && $i <= 0x7e ) {
            my $table_index = get_94table_index( $map_table{$i} );
            printf MAP " %6d,", $table_index;
        } else {
            print MAP " 0xFFFD,";
        }
    }
}

# print_table_index($map_name, @maps)
#
# Writes to the MAP handle the C arrays g<name>IndexShiftJis and
# g<name>IndexJis0208, followed by g<name>Index which points at both.
sub print_table_index {
    my ($map_name, @maps) = @_;
    print MAP "static const uint16_t g${map_name}IndexShiftJis[] = {";
    print_sjis_table_index(@maps);
    print MAP "\n};\n";
    print MAP "static const uint16_t g${map_name}IndexJis0208[] = {";
    print_jis_table_index(@maps);
    print MAP "\n};\n";
    print MAP "static const uint16_t * const g${map_name}Index[] = {";
    print MAP "\n g${map_name}IndexShiftJis, g${map_name}IndexJis0208";
    print MAP "\n};\n\n";
}

# print_0212_table_index($map_name, @maps)
#
# Writes to the MAP handle the single C array g<name>Index, built by
# print_jis_table_index().
sub print_0212_table_index {
    my ($map_name, @maps) = @_;
    print MAP "static const uint16_t g${map_name}Index[] = {";
    print_jis_table_index(@maps);
    print MAP "\n};\n\n";
}

# print_table()
#
# Writes to the MAP handle the C array gJapaneseMap: every pooled 94-cell
# table in index order, each preceded by an "index N" comment. Missing or
# zero cells are written as 0xFFFD.
sub print_table {
    print MAP "static const uint16_t gJapaneseMap[] = {";
    for ( my $index = 0; $index < $table_next_count; $index += 94 ) {
        print MAP "\n /* index $index */\n ";
        my $map_table = $table[$index] || [];

        # Starting the counter at 1 puts 7 values on a table's first line and
        # 8 on the following ones. NOTE(review): presumably this lines the
        # rows up with the low three bits of the cell byte (0x21 & 7 == 1)
        # -- confirm before changing.
        my $print_count = 1;
        for my $cell ( 0 .. 93 ) {
            my $u = $map_table->[$cell];
            $u = 0xfffd if !defined $u || $u == 0;
            printf MAP " 0x%04X,", $u;
            if ( ++$print_count == 8 ) {
                print MAP "\n ";
                $print_count = 0;
            }
        }
    }
    print MAP "\n};\n";
}


my %cp932   = read_sjis_map( 'CP932.TXT',  0, 1 );
my %ibm     = read_sjis_map( 'IBM943.TXT', 0, 1 );
my $jis0212 = read_0212_map( 'JIS0212.TXT', 0, 1 );

# %printed is deliberately NOT reset between the jis0201 and jis0208 files.
%printed = ();
write_fromu_map('jis0201-uf-unify', 'jis',
    $cp932{jis0201},
    $ibm{jis0201}
);
write_fromu_map('jis0208-uf-unify', 'jis',
    $cp932{jis0208},
    $ibm{jis0208}
);

%printed = ();
write_fromu_map('jis0208ext-uf-unify', 'jis',
    $cp932{jis0208undef},
    $ibm{jis0208undef}
);

%printed = ();
write_fromu_map('sjis-uf-unify', 'sjis',
    @cp932{'jis0201', 'jis0208', 'jis0208undef', 'sjis1undef', 'sjis2undef'},
    @ibm{'jis0201', 'jis0208', 'jis0208undef', 'sjis1undef', 'sjis2undef'}
);

# MAP is the output handle every print_* sub writes to.
open MAP, '>', 'japanese.map' or die "Cannot write japanese.map: $!";
binmode MAP;

# Copy the header stored after __DATA__, up to the line starting with "!".
while ( my $line = <DATA> ) {
    last if $line =~ /^!/;
    print MAP $line;
}
print MAP "/* generated by jamap.pl @source_files */\n\n";
print MAP <<EOM;
// IE-compatible handling of undefined codepoints:
// 0x80 --> U+0080
// 0xa0 --> U+F8F0
// 0xfd --> U+F8F1
// 0xfe --> U+F8F2
// 0xff --> U+F8F3
EOM

print_table_index('CP932', @cp932{'jis0208', 'jis0208undef', 'sjis2undef'});
print_table_index('IBM943', @ibm{'jis0208', 'jis0208undef', 'sjis2undef'});
print_0212_table_index('JIS0212', $jis0212);
print_table();

close MAP or warn "Error closing japanese.map: $!";

__DATA__
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

!