intl/uconv/tools/jamap.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/uconv/tools/jamap.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,340 @@
     1.4 +#!/usr/local/bin/perl
     1.5 +use strict;
     1.6 +
     # Names of the mapping files read so far; interpolated into the
     # "generated by" comment of the output file.
     my @source_files;

     # Lead-byte lookup, indexed by the high byte of a Shift_JIS code:
     #   -1         the code is a single byte (its high byte is zero)
     #   -2         lead bytes 0xF0..0xF9, classified as private use
     #   >= 0x2100  JIS code base (row << 8) of the first of the two rows
     #              covered by this lead byte
     #   undef      any other high byte
     my @sjis_h;
     $sjis_h[0] = -1;
     @sjis_h[0x81..0x9f] = map { 0x2100 + $_ * 0x200 } (0 .. 30);
     @sjis_h[0xe0..0xef] = map { 0x5F00 + $_ * 0x200 } (0 .. 15);
     @sjis_h[0xf0..0xf9] = (-2) x 10;

     # Trail-byte lookup, indexed by the low byte of a Shift_JIS code.  The value
     # is the JIS low byte (0x21..0x7E) when the trail byte belongs to the first
     # row of the pair, or 0x100 + low byte when it belongs to the second row, so
     # that adding it to the lead-byte base carries into the next row.
     # Trail bytes outside 0x40..0x7E and 0x80..0xFC have no entry.
     my @sjis_l;
     @sjis_l[0x40..0x7e] = (0x21..0x5f);
     @sjis_l[0x80..0xfc] = (0x60..0x7e, 0x121..0x17e);

     # sjis_to_jis($s)
     #
     # Classifies the Shift_JIS code $s (an integer; one or two bytes) and
     # returns the two-element list ($code, $type):
     #
     #   'jis0201'       single byte 0x00..0x7F or 0xA1..0xDF; $code is $s
     #   'sjis1undef'    any other single byte; $code is $s
     #   'jis0208'       double byte whose JIS code is below 0x2900 or within
     #                   0x3000..0x74FF; $code is the JIS code
     #   'jis0208undef'  any other convertible double byte; $code is the JIS code
     #   'private'       lead byte 0xF0..0xF9; $code is $s
     #   'sjis2undef'    unknown lead byte or unknown trail byte; $code is $s
     #
     # Missing table entries are detected with defined() rather than by comparing
     # undef numerically, so the sub stays silent when warnings are enabled.
     sub sjis_to_jis {
       my ($s) = @_;

       my $base = $sjis_h[ ($s >> 8) & 0xff ];

       # High byte without a table entry.
       return ($s, 'sjis2undef') unless defined $base;

       if ( $base == -1 ) { # single byte
         my $is_roman = $s <= 0x7f;
         my $is_kana  = $s >= 0xa1 && $s <= 0xdf;
         return ($s, ($is_roman || $is_kana) ? 'jis0201' : 'sjis1undef');
       }

       return ($s, 'private') if $base == -2;

       # Double byte: the trail byte must have a table entry too.
       my $low = $sjis_l[ $s & 0xff ];
       return ($s, 'sjis2undef') unless defined $low;

       my $j = $base + $low;
       my $in_0208 = $j < 0x2900 || ( $j >= 0x3000 && $j < 0x7500 );
       return ($j, $in_0208 ? 'jis0208' : 'jis0208undef');
     }
    1.64 +
    1.65 +
    # read_sjis_map($filename, $s_col, $u_col)
    #
    # Reads a whitespace-separated mapping file.  $s_col and $u_col are the
    # zero-based column numbers holding the Shift_JIS code and the Unicode code
    # point; both must be written as 0x-prefixed hexadecimal, and every other
    # line is skipped.
    #
    # Returns a hash (as a flat list) keyed by the type reported by
    # sjis_to_jis(); each value is an array ref of [jis, sjis, unicode] triples
    # in file order.  Dies if the file cannot be opened and records $filename
    # in @source_files once it has been read.
    sub read_sjis_map {
      my ($filename, $s_col, $u_col) = @_;
      my %map;

      # Three-argument open with a lexical handle: the filename can never be
      # taken for a mode or a pipe, and the handle cannot clash with the
      # global MAP handle used for output elsewhere in this file.
      open my $in, '<', $filename or die "cannot open $filename: $!";
      while ( my $line = <$in> ) {
        my @cols = split /\s+/, $line;
        my ($s, $u) = @cols[$s_col, $u_col];

        # Short lines leave one or both columns undefined.
        next unless defined $s && defined $u;
        next unless $s =~ /^0x[0-9A-Fa-f]+$/ && $u =~ /^0x[0-9A-Fa-f]+$/;

        # oct() understands the 0x prefix.
        $s = oct($s);
        $u = oct($u);

        my ($j, $type) = sjis_to_jis($s);
        push @{$map{$type}}, [$j, $s, $u];
      }
      close $in or warn "error closing $filename: $!";

      push @source_files, $filename;
      return %map;
    }
    1.86 +
    # read_0212_map($filename, $j_col, $u_col)
    #
    # Reads a whitespace-separated mapping file.  $j_col and $u_col are the
    # zero-based column numbers holding the JIS code and the Unicode code
    # point; both must be written as 0x-prefixed hexadecimal, and every other
    # line is skipped.
    #
    # Returns an array ref of [jis, 0, unicode] triples in file order (the
    # middle slot is the Shift_JIS position used by the other maps and is
    # always 0 here).  The array is empty, not undef, when no line qualifies.
    # Dies if the file cannot be opened and records $filename in @source_files
    # once it has been read.
    sub read_0212_map {
      my ($filename, $j_col, $u_col) = @_;
      my $map = [];

      # Three-argument open with a lexical handle: the filename can never be
      # taken for a mode or a pipe, and the handle cannot clash with the
      # global MAP handle used for output elsewhere in this file.
      open my $in, '<', $filename or die "cannot open $filename: $!";
      while ( my $line = <$in> ) {
        my @cols = split /\s+/, $line;
        my ($j, $u) = @cols[$j_col, $u_col];

        # Short lines leave one or both columns undefined.
        next unless defined $j && defined $u;
        next unless $j =~ /^0x[0-9A-Fa-f]+$/ && $u =~ /^0x[0-9A-Fa-f]+$/;

        # oct() understands the 0x prefix.
        $j = oct($j);
        $u = oct($u);

        # U+007E (TILDE) is replaced by U+FF5E (FULLWIDTH TILDE).
        $u = 0xff5e if $u == 0x007e;

        push @$map, [$j, 0, $u];
      }
      close $in or warn "error closing $filename: $!";

      push @source_files, $filename;
      return $map;
    }
   1.106 +
   1.107 +
   # Unicode code point => native code already written for it.  The table is
   # shared by successive write_fromu_map() calls until the caller empties it,
   # which is how several output files can be kept free of duplicates.
   my %printed;

   # write_fromu_map($filename, $code, @maps)
   #
   # Writes "native<TAB>unicode" lines to $filename for every [jis, sjis,
   # unicode] triple found in @maps (a list of array refs; undefined entries
   # are skipped).  The native code is the Shift_JIS value when $code is
   # 'sjis' and the JIS value otherwise; it is printed with two hex digits
   # when below 0x100 and with four otherwise.
   #
   # Only the first native code seen for a given Unicode code point is
   # written.  A later triple that maps the same code point to a different
   # native code is reported on STDOUT as a conflict and dropped.
   #
   # Dies if the file cannot be created.
   sub write_fromu_map {
     my ($filename, $code, @maps) = @_;

     # Three-argument open keeps the mode separate from the filename.
     open my $out, '>', $filename or die "cannot create $filename: $!";

     foreach my $map (@maps) {
       next unless defined $map;

       foreach my $pair (@$map) {
         my ($j, $s, $u) = @$pair;
         $j = $s if $code eq 'sjis';

         if ( defined $printed{$u} ) {
           # Both sides are integers, so compare them numerically.
           if ( $printed{$u} != $j ) {
             printf "conflict 0x%04x to 0x%04x, 0x%04x\n", $u, $printed{$u}, $j;
           }
           next;
         }

         my $format = $j < 0x100 ? "0x%02X\t0x%04X\n" : "0x%04X\t0x%04X\n";
         printf {$out} $format, $j, $u;
         $printed{$u} = $j;
       }
     }

     close $out or warn "error closing $filename: $!";
   }
   1.134 +
   # Registry of 94-entry tables, shared by get_94table_index(),
   # get_188table_index() and the code that prints the tables.
   my @table;                 # $table[$index] is an array ref; only multiples of 94 are used
   my %table;                 # comma-joined table contents => index
   my $table_next_count = 0;  # first index not handed out yet

   # Builds the lookup key for cells $first .. $last of the array ref $cells.
   # Missing cells count as 0, which is what int(undef) yields.
   sub _table_key {
     my ($cells, $first, $last) = @_;
     return join ',',
       map { defined $cells->[$_] ? int($cells->[$_]) : 0 } ($first .. $last);
   }

   # get_94table_index($map_table)
   #
   # Returns the index of a registered 94-entry table whose first 94 cells
   # equal those of $map_table (an array ref; undef is treated as an empty
   # table).  When there is none, $map_table itself is registered at the next
   # free index, which then advances by 94.
   sub get_94table_index {
     my ($map_table) = @_;
     $map_table = [] unless defined $map_table;

     my $key = _table_key($map_table, 0, 93);
     my $table_index = $table{$key};
     if ( !defined($table_index) ) {
       $table_index = $table_next_count;
       $table_next_count += 94;
       $table[$table_index] = $map_table;
       $table{$key} = $table_index;
     }
     return $table_index;
   }

   # get_188table_index($map_table)
   #
   # Same as get_94table_index() for a 188-entry table (undef is treated as an
   # empty table).  A new table is stored as two consecutive 94-entry halves,
   # and each half is also made findable through get_94table_index() unless an
   # identical 94-entry table is already registered.
   sub get_188table_index {
     my ($map_table) = @_;
     $map_table = [] unless defined $map_table;

     my $key = _table_key($map_table, 0, 187);
     my $table_index = $table{$key};
     return $table_index if defined $table_index;

     # Not registered yet: only now are the halves and their keys needed.
     my $map_table1 = [ @{$map_table}[0 .. 93] ];
     my $map_table2 = [ @{$map_table}[94 .. 187] ];
     my $key1 = _table_key($map_table1, 0, 93);
     my $key2 = _table_key($map_table2, 0, 93);

     $table_index = $table_next_count;
     $table_next_count += 188;
     $table[$table_index] = $map_table1;
     $table[$table_index + 94] = $map_table2;
     $table{$key} = $table_index;
     $table{$key1} = $table_index unless defined($table{$key1});
     $table{$key2} = $table_index + 94 unless defined($table{$key2});

     return $table_index;
   }

   # Register the all-empty table first so that it gets index 0.
   get_188table_index([]);
   1.173 +
   # print_sjis_table_index(@maps)
   #
   # Writes to the MAP handle the 128 comma-terminated entries of the index
   # for high bytes 0x80..0xFF, eight per output line.  @maps is a list of
   # array refs of [jis, sjis, unicode] triples.
   #
   # Triples are first grouped by the high byte of their Shift_JIS code, with
   # trail bytes 0x40..0x7E and 0x80..0xFC packed into cells 0..187.  Each
   # high byte that is not special-cased below is then printed as the index
   # returned by get_188table_index() for its group; the special cases print
   # a fixed code point in hexadecimal instead.  "conflict!" goes to STDOUT
   # when two triples put different code points into the same cell (the later
   # one wins).
   sub print_sjis_table_index {
     my @maps = @_;
     my %cells_of_lead;

     foreach my $map (@maps) {
       foreach my $triple (@$map) {
         my $sjis    = $triple->[1];
         my $unicode = $triple->[2];
         my $lead    = $sjis >> 8;
         my $trail   = $sjis & 0xff;

         # Only 0x40..0x7E and 0x80..0xFC are usable trail bytes.
         next if $trail < 0x40 || $trail == 0x7f || $trail > 0xfc;

         # Close the gap left by 0x7F so the cells run from 0 to 187.
         my $cell = $trail < 0x7f ? $trail - 0x40 : $trail - 0x41;

         my $previous = $cells_of_lead{$lead}->[$cell];
         print "conflict!\n" if defined($previous) && $previous != $unicode;
         $cells_of_lead{$lead}->[$cell] = $unicode;
       }
     }

     foreach my $lead (0x80 .. 0xff) {
       print MAP "\n " unless $lead & 0x7;

       my $entry;
       if ( $lead == 0x80 ) {
         $entry = " 0xFFFD,";
       } elsif ( $lead == 0xa0 ) {
         $entry = " 0xF8F0,";
       } elsif ( $lead >= 0xa1 && $lead <= 0xdf ) {
         $entry = sprintf " 0x%04X,", $lead + 0xfec0;
       } elsif ( $lead >= 0xf0 && $lead <= 0xf9 ) {
         $entry = sprintf " 0x%04X,", 0xe000 + ($lead - 0xf0) * 188;
       } elsif ( $lead >= 0xfd ) {
         $entry = sprintf " 0x%04X,", $lead + (0xf8f1 - 0xfd);
       } else {
         $entry = sprintf " %6d,", get_188table_index($cells_of_lead{$lead});
       }
       print MAP $entry;
     }
   }
   1.216 +
   # print_jis_table_index(@maps)
   #
   # Writes to the MAP handle the 128 comma-terminated entries of the index
   # for high bytes 0x00..0x7F, eight per output line.  @maps is a list of
   # array refs of [jis, sjis, unicode] triples.
   #
   # Triples are first grouped by the high byte of their JIS code, with the
   # low byte turned into a cell number by subtracting 0x21.  High bytes
   # 0x21..0x7E are printed as the index returned by get_94table_index() for
   # their group; all others are printed as 0xFFFD.  "conflict!" goes to
   # STDOUT when two triples put different code points into the same cell
   # (the later one wins).
   sub print_jis_table_index {
     my @maps = @_;
     my %map_table;
     foreach my $map (@maps) {
       foreach my $pair (@$map) {
         my ($j, $s, $u) = @$pair;
         my $row = $j >> 8;
         my $cell = ($j&0xff) - 0x21;

         # A low byte below 0x21 would give a negative subscript, which
         # either aborts the script (nothing to count back from) or silently
         # overwrites a cell counted from the end of the row.
         next if $cell < 0;

         if ( defined($map_table{$row}->[$cell]) && $map_table{$row}->[$cell] != $u ) {
            print "conflict!\n";
         }
         $map_table{$row}->[$cell] = $u;
       }
     }

     for ( my $i = 0; $i < 0x80; $i++ ) {
       if ( ($i & 0x7) == 0 ) {
         print MAP "\n ";
       }
       if ( $i >= 0x21 && $i <= 0x7e ) {
         my $table_index = get_94table_index($map_table{$i});
         printf MAP " %6d,", $table_index;
       } else {
         print MAP " 0xFFFD,";
       }
     }
   }
   1.244 +
   # print_table_index($map_name, @maps)
   #
   # Writes three C definitions to the MAP handle: the g<name>IndexShiftJis
   # array filled by print_sjis_table_index(@maps), the g<name>IndexJis0208
   # array filled by print_jis_table_index(@maps), and the g<name>Index array
   # that points at both.
   sub print_table_index {
     my ($map_name, @maps) = @_;
     my $sjis_array = 'g' . $map_name . 'IndexShiftJis';
     my $jis_array  = 'g' . $map_name . 'IndexJis0208';

     print MAP 'static const uint16_t ', $sjis_array, '[] = {';
     print_sjis_table_index(@maps);
     print MAP "\n};\n";

     print MAP 'static const uint16_t ', $jis_array, '[] = {';
     print_jis_table_index(@maps);
     print MAP "\n};\n";

     print MAP 'static const uint16_t * const g', $map_name, 'Index[] = {',
               "\n  ", $sjis_array, ', ', $jis_array,
               "\n};\n\n";
   }
   1.257 +
   # print_0212_table_index($map_name, @maps)
   #
   # Writes the C array g<name>Index to the MAP handle, with its contents
   # produced by print_jis_table_index(@maps).
   sub print_0212_table_index {
     my ($map_name, @maps) = @_;
     print MAP 'static const uint16_t g', $map_name, 'Index[] = {';
     print_jis_table_index(@maps);
     print MAP "\n};\n\n";
   }
   1.264 +
   1.265 +
   # print_table()
   #
   # Writes the C array gJapaneseMap to the MAP handle: every registered
   # 94-entry table in index order, each preceded by a comment giving its
   # index.  Cells that are missing or zero are written as 0xFFFD.
   sub print_table {
     print MAP "static const uint16_t gJapaneseMap[] = {";

     my $base = 0;
     while ( $base < $table_next_count ) {
       print MAP "\n  /* index $base */\n         ";
       my $cells = $table[$base];

       # The counter starts at 1, so the first output line of each table holds
       # seven values and the following ones eight.
       my $column = 1;
       foreach my $offset (0 .. 93) {
         my $u = $cells->[$offset] || 0xfffd;
         printf MAP " 0x%04X,", $u;
         $column++;
         if ( $column == 8 ) {
           print MAP "\n ";
           $column = 0;
         }
       }

       $base += 94;
     }

     print MAP "\n};\n";
   }
   1.285 +
   1.286 +
   # ---------------------------------------------------------------------
   # Main program: read the three mapping files from the current directory,
   # write the four *-uf-unify files, then generate japanese.map.
   # ---------------------------------------------------------------------

   my %cp932 = read_sjis_map('CP932.TXT', 0, 1);
   my %ibm = read_sjis_map('IBM943.TXT', 0, 1);
   my $jis0212 = read_0212_map('JIS0212.TXT', 0, 1);

   # %printed is deliberately not emptied between the first two files, so a
   # code point written to jis0201-uf-unify is not written again to
   # jis0208-uf-unify.
   %printed = ();
   write_fromu_map('jis0201-uf-unify', 'jis',
     $cp932{jis0201},
     $ibm{jis0201}
   );
   write_fromu_map('jis0208-uf-unify', 'jis',
     $cp932{jis0208},
     $ibm{jis0208}
   );

   %printed = ();
   write_fromu_map('jis0208ext-uf-unify', 'jis',
     $cp932{jis0208undef},
     $ibm{jis0208undef}
   );

   %printed = ();
   write_fromu_map('sjis-uf-unify', 'sjis',
     @cp932{'jis0201', 'jis0208', 'jis0208undef', 'sjis1undef', 'sjis2undef'},
     @ibm{'jis0201', 'jis0208', 'jis0208undef', 'sjis1undef', 'sjis2undef'}
   );

   # MAP has to stay a global bareword handle because the print_* subs write
   # to it by name; the three-argument form still keeps the mode apart from
   # the filename.
   open MAP, '>', 'japanese.map' or die "cannot create japanese.map: $!";
   binmode MAP;

   # Copy the header stored after __DATA__ up to, but not including, the line
   # that starts with "!".
   while ( my $line = <DATA> ) {
     last if $line =~ /^!/;
     print MAP $line;
   }
   print MAP "/* generated by jamap.pl @source_files */\n\n";

   # NOTE(review): the text below says 0x80 maps to U+0080, whereas
   # print_sjis_table_index() emits 0xFFFD for 0x80 -- confirm which one is
   # intended before changing either.
   print MAP
     "// IE-compatible handling of undefined codepoints:\n",
     "// 0x80 --> U+0080\n",
     "// 0xa0 --> U+F8F0\n",
     "// 0xfd --> U+F8F1\n",
     "// 0xfe --> U+F8F2\n",
     "// 0xff --> U+F8F3\n";

   print_table_index('CP932', @cp932{'jis0208', 'jis0208undef', 'sjis2undef'});
   print_table_index('IBM943', @ibm{'jis0208', 'jis0208undef', 'sjis2undef'});
   print_0212_table_index('JIS0212', $jis0212);
   print_table();

   close MAP or warn "error closing japanese.map: $!";
   1.336 +
   1.337 +__DATA__
   1.338 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   1.339 +/* This Source Code Form is subject to the terms of the Mozilla Public
   1.340 + * License, v. 2.0. If a copy of the MPL was not distributed with this
   1.341 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   1.342 +
   1.343 +!

mercurial