intl/uconv/tools/jamap.pl

branch
TOR_BUG_9701
changeset 8
97036ab72558
equal deleted inserted replaced
-1:000000000000 0:bf395f70021e
1 #!/usr/local/bin/perl
2 use strict;
3
# Names of the mapping files read so far; the list is interpolated into the
# "generated by" banner of japanese.map.
my @source_files;

# Lead-byte table, indexed by the high byte of a Shift_JIS code:
#   > 0    base JIS value for that lead byte (row part, low byte zero)
#   -1     no lead byte at all, i.e. a single-byte code
#   -2     lead byte 0xF0..0xF9, classified as 'private' by sjis_to_jis
#   undef  lead byte that is not assigned
my @sjis_h;
$sjis_h[0] = -1;
for my $lead ( 0x81 .. 0x9f ) {
    $sjis_h[$lead] = 0x2100 + ( $lead - 0x81 ) * 0x200;
}
for my $lead ( 0xe0 .. 0xef ) {
    $sjis_h[$lead] = 0x5F00 + ( $lead - 0xe0 ) * 0x200;
}
for my $lead ( 0xf0 .. 0xf9 ) {
    $sjis_h[$lead] = -2;
}

# Trail-byte table, indexed by the low byte of a Shift_JIS code.  The value is
# added to the lead-byte base; values of 0x121 and above therefore carry into
# the next row.  Unassigned trail bytes (including 0x7f) stay undef.
my @sjis_l;
for my $trail ( 0x40 .. 0x7e ) {
    $sjis_l[$trail] = $trail - 0x1f;    # 0x40 -> 0x21 ... 0x7e -> 0x5f
}
for my $trail ( 0x80 .. 0x9e ) {
    $sjis_l[$trail] = $trail - 0x20;    # 0x80 -> 0x60 ... 0x9e -> 0x7e
}
for my $trail ( 0x9f .. 0xfc ) {
    $sjis_l[$trail] = $trail + 0x82;    # 0x9f -> 0x121 ... 0xfc -> 0x17e
}
14
# Classify a Shift_JIS code and, where possible, convert it to a JIS code.
#
#   $s       Shift_JIS code as an integer (one or two bytes).
#
# Returns the list ($code, $type).  $type is one of:
#   'jis0208'       $code is the converted JIS value, below 0x2900 or in
#                   0x3000..0x74ff
#   'jis0208undef'  $code is a converted JIS value outside those ranges
#   'jis0201'       single byte 0x00..0x7f or 0xa1..0xdf; $code is $s
#   'sjis1undef'    any other single byte; $code is $s
#   'private'       lead byte flagged -2 in @sjis_h; $code is $s
#   'sjis2undef'    unassigned lead or trail byte; $code is $s
sub sjis_to_jis {
    my ($s) = @_;

    my $lead_base = $sjis_h[ ( $s >> 8 ) & 0xff ];

    # Two-byte code whose lead byte maps onto a JIS row.
    if ( $lead_base > 0 ) {
        my $trail_offset = $sjis_l[ $s & 0xff ];

        # An unassigned trail byte yields undef, which compares equal to 0.
        return ( $s, 'sjis2undef' ) if $trail_offset == 0;

        my $jis = $lead_base + $trail_offset;
        my $is_standard =
          ( $jis < 0x2900 ) || ( $jis >= 0x3000 && $jis < 0x7500 );
        return ( $jis, $is_standard ? 'jis0208' : 'jis0208undef' );
    }

    # Single-byte code: roman half, kana half, or undefined.
    if ( $lead_base == -1 ) {
        my $is_0201 = ( $s <= 0x7f ) || ( $s >= 0xa1 && $s <= 0xdf );
        return ( $s, $is_0201 ? 'jis0201' : 'sjis1undef' );
    }

    return ( $s, 'private' ) if $lead_base == -2;

    # Lead byte with no table entry at all.
    return ( $s, 'sjis2undef' );
}
61
62
# Read a Shift_JIS <-> Unicode mapping file and group its entries by the
# classification that sjis_to_jis assigns to each Shift_JIS code.
#
#   $filename  path of the whitespace-separated mapping file
#   $s_col     zero-based column holding the Shift_JIS code ("0x...")
#   $u_col     zero-based column holding the Unicode value ("0x...")
#
# Lines whose selected columns are missing or are not plain 0x-prefixed hex
# (comments, unmapped entries) are skipped.  Returns a hash (as a list) whose
# keys are type names and whose values are array refs of [$jis, $sjis, $unicode]
# triples in file order.  Dies if the file cannot be opened; the filename is
# appended to @source_files after a successful read.
sub read_sjis_map {
    my ($filename, $s_col, $u_col) = @_;
    my %map;
    my $hex = qr/^0x[0-9A-Fa-f]+$/;

    # A lexical handle with three-argument open keeps this reader from
    # clobbering the global MAP handle used for output, and stops special
    # characters in $filename from being interpreted as an open mode.
    open my $in, '<', $filename or die "Cannot open $filename: $!";
    while ( my $line = <$in> ) {
        my @cols = split /\s+/, $line;
        my ($s, $u) = @cols[$s_col, $u_col];
        next unless defined $s && defined $u;
        next unless $s =~ $hex && $u =~ $hex;

        # oct() understands the "0x" prefix and returns the numeric value.
        $s = oct($s);
        $u = oct($u);

        my ($j, $type) = sjis_to_jis($s);
        push @{ $map{$type} }, [$j, $s, $u];
    }
    close $in or warn "Error closing $filename: $!";

    push @source_files, $filename;
    return %map;
}
83
# Read a JIS X 0212 <-> Unicode mapping file.
#
#   $filename  path of the whitespace-separated mapping file
#   $j_col     zero-based column holding the JIS code ("0x...")
#   $u_col     zero-based column holding the Unicode value ("0x...")
#
# Lines whose selected columns are missing or are not plain 0x-prefixed hex
# are skipped.  Returns an array ref of [$jis, 0, $unicode] triples in file
# order (the middle slot, used for the Shift_JIS code elsewhere, is always 0),
# or undef when no line qualified.  Dies if the file cannot be opened; the
# filename is appended to @source_files after a successful read.
sub read_0212_map {
    my ($filename, $j_col, $u_col) = @_;
    my $map;
    my $hex = qr/^0x[0-9A-Fa-f]+$/;

    # A lexical handle with three-argument open keeps this reader from
    # clobbering the global MAP handle used for output, and stops special
    # characters in $filename from being interpreted as an open mode.
    open my $in, '<', $filename or die "Cannot open $filename: $!";
    while ( my $line = <$in> ) {
        my @cols = split /\s+/, $line;
        my ($j, $u) = @cols[$j_col, $u_col];
        next unless defined $j && defined $u;
        next unless $j =~ $hex && $u =~ $hex;

        $j = oct($j);
        $u = oct($u);

        # Map U+007E (TILDE) to U+FF5E (FULLWIDTH TILDE).
        $u = 0xff5e if $u == 0x007e;

        push @$map, [$j, 0, $u];
    }
    close $in or warn "Error closing $filename: $!";

    push @source_files, $filename;
    return $map;
}
103
104
# Unicode values already emitted, mapped to the legacy code they were written
# with.  The state survives across write_fromu_map calls until the caller
# clears it, so consecutive output files can share one de-duplication set.
my %printed;

# Write a legacy-code / Unicode table, one "0xCODE<TAB>0xUUUU" line per
# Unicode value not seen before.
#
#   $filename  output path (created or truncated)
#   $code      'sjis' to write the Shift_JIS code of each entry; any other
#              value writes the JIS code
#   @maps      array refs of [$jis, $sjis, $unicode] triples, processed in
#              order; undefined entries are ignored
#
# The first code written for a Unicode value wins.  A later entry that maps
# the same Unicode value to a different code is reported on STDOUT as a
# conflict and is not written.  Dies if the file cannot be opened.
sub write_fromu_map {
    my ($filename, $code, @maps) = @_;

    # Three-argument open with a lexical handle: the filename is taken
    # literally (two-argument open would strip surrounding whitespace and
    # honour mode characters), and the global MAP handle is left alone.
    open my $out, '>', $filename or die "Cannot create $filename: $!";

    foreach my $map (@maps) {
        next unless defined $map;
        foreach my $pair (@$map) {
            my ($j, $s, $u) = @$pair;
            $j = $s if $code eq 'sjis';

            if ( defined $printed{$u} ) {
                if ( $printed{$u} != $j ) {
                    printf "conflict 0x%04x to 0x%04x, 0x%04x\n", $u, $printed{$u}, $j;
                }
                next;
            }

            # Single-byte codes are written with two hex digits, others with four.
            if ( $j < 0x100 ) {
                printf {$out} "0x%02X\t0x%04X\n", $j, $u;
            } else {
                printf {$out} "0x%04X\t0x%04X\n", $j, $u;
            }
            $printed{$u} = $j;
        }
    }

    close $out or warn "Error closing $filename: $!";
}
131
# Pool of 94-entry mapping rows shared by every index table.
#   @table             row storage; a row registered at offset N sits in $table[N]
#   %table             comma-joined row contents => offset of the stored row
#   $table_next_count  offset the next new row will receive
my @table;
my %table;
my $table_next_count = 0;

# Return the pool offset of a 94-entry row, storing the row first if no row
# with the same contents has been registered.  Missing cells count as 0.
sub get_94table_index {
    my ($map_table) = @_;

    my @cells = map { int( $map_table->[$_] ) } 0 .. 93;
    my $key = join ',', @cells;
    return $table{$key} if defined $table{$key};

    my $slot = $table_next_count;
    $table_next_count += 94;
    $table[$slot] = $map_table;
    $table{$key} = $slot;
    return $slot;
}

# Return the pool offset of a 188-entry block (two consecutive 94-entry rows),
# storing it first if no identical block has been registered.  When a new
# block is stored, each half is also made available to get_94table_index,
# unless a row with the same contents is already registered.
sub get_188table_index {
    my ($map_table) = @_;

    my @halves = ( [ @{$map_table}[ 0 .. 93 ] ], [ @{$map_table}[ 94 .. 187 ] ] );
    my @half_keys;
    for my $half (@halves) {
        push @half_keys, join ',', map { int( $half->[$_] ) } 0 .. 93;
    }

    # The key of the whole block is simply both half keys joined together.
    my $key = join ',', @half_keys;
    return $table{$key} if defined $table{$key};

    my $slot = $table_next_count;
    $table_next_count += 188;
    $table{$key} = $slot;
    for my $n ( 0, 1 ) {
        my $half_slot = $slot + 94 * $n;
        $table[$half_slot] = $halves[$n];
        $table{ $half_keys[$n] } = $half_slot
          unless defined $table{ $half_keys[$n] };
    }
    return $slot;
}

# Register the all-empty block first so that it always lives at offset 0.
get_188table_index( [] );
170
# Print the body of the Shift_JIS lead-byte index (lead bytes 0x80..0xff,
# eight entries per line) to the global MAP handle.
#
#   @maps  array refs of [$jis, $sjis, $unicode] triples
#
# Entries are grouped into one 188-cell block per lead byte.  Entries whose
# trail byte is outside 0x40..0x7e and 0x80..0xfc are ignored.  If two entries
# put different Unicode values in one cell, "conflict!" is printed on STDOUT
# and the later value wins.
sub print_sjis_table_index {
    my @maps = @_;
    my %map_table;

    for my $map (@maps) {
        for my $pair (@$map) {
            my ( undef, $sjis, $ucs ) = @$pair;
            my $lead  = $sjis >> 8;
            my $trail = $sjis & 0xff;

            # Trail bytes skip 0x7f, so the upper range shifts by one more.
            my $cell;
            if ( $trail >= 0x40 && $trail <= 0x7e ) {
                $cell = $trail - 0x40;
            }
            elsif ( $trail >= 0x80 && $trail <= 0xfc ) {
                $cell = $trail - 0x41;
            }
            else {
                next;
            }

            my $seen = $map_table{$lead}->[$cell];
            print "conflict!\n" if defined($seen) && $seen != $ucs;
            $map_table{$lead}->[$cell] = $ucs;
        }
    }

    for my $lead ( 0x80 .. 0xff ) {
        print MAP "\n " if ( $lead & 0x7 ) == 0;

        if ( $lead == 0x80 ) {
            print MAP " 0xFFFD,";
        }
        elsif ( $lead == 0xa0 ) {
            print MAP " 0xF8F0,";
        }
        elsif ( $lead >= 0xa1 && $lead <= 0xdf ) {
            # 0xa1..0xdf become 0xFF61..0xFF9F.
            printf MAP " 0x%04X,", $lead + 0xfec0;
        }
        elsif ( $lead >= 0xf0 && $lead <= 0xf9 ) {
            # One 188-value span per lead byte, starting at 0xE000.
            printf MAP " 0x%04X,", 0xe000 + ( $lead - 0xf0 ) * 188;
        }
        elsif ( $lead >= 0xfd ) {
            # 0xfd..0xff become 0xF8F1..0xF8F3.
            printf MAP " 0x%04X,", $lead + ( 0xf8f1 - 0xfd );
        }
        else {
            # Every other lead byte gets the pool offset of its block.
            printf MAP " %6d,", get_188table_index( $map_table{$lead} );
        }
    }
}
213
# Print the body of a JIS row index (rows 0x00..0x7f, eight entries per line)
# to the global MAP handle.
#
#   @maps  array refs of [$jis, $sjis, $unicode] triples
#
# Entries are grouped into one 94-cell row per high byte of the JIS code; the
# cell is the low byte minus 0x21.  If two entries put different Unicode
# values in one cell, "conflict!" is printed on STDOUT and the later value
# wins.  Rows 0x21..0x7e are written as pool offsets, all others as 0xFFFD.
sub print_jis_table_index {
    my @maps = @_;
    my %map_table;

    for my $map (@maps) {
        for my $pair (@$map) {
            my ( $jis, undef, $ucs ) = @$pair;
            my $row  = $jis >> 8;
            my $cell = ( $jis & 0xff ) - 0x21;

            my $seen = $map_table{$row}->[$cell];
            print "conflict!\n" if defined($seen) && $seen != $ucs;
            $map_table{$row}->[$cell] = $ucs;
        }
    }

    for my $row ( 0 .. 0x7f ) {
        print MAP "\n " if ( $row & 0x7 ) == 0;

        if ( $row < 0x21 || $row > 0x7e ) {
            print MAP " 0xFFFD,";
            next;
        }
        printf MAP " %6d,", get_94table_index( $map_table{$row} );
    }
}
241
# Print three C definitions for one code page to the global MAP handle:
# g<name>IndexShiftJis, g<name>IndexJis0208, and g<name>Index, an array
# pointing at the first two.
#
#   $map_name  name fragment embedded in the C identifiers
#   @maps      array refs of [$jis, $sjis, $unicode] triples, passed to both
#              index printers
sub print_table_index {
    my ($map_name, @maps) = @_;

    print MAP "static const uint16_t g${map_name}IndexShiftJis[] = {";
    print_sjis_table_index(@maps);
    print MAP "\n};\n";

    print MAP "static const uint16_t g${map_name}IndexJis0208[] = {";
    print_jis_table_index(@maps);
    print MAP "\n};\n";

    print MAP "static const uint16_t * const g${map_name}Index[] = {",
      "\n g${map_name}IndexShiftJis, g${map_name}IndexJis0208",
      "\n};\n\n";
}
254
# Print the single C array g<name>Index, holding a JIS row index, to the
# global MAP handle.
#
#   $map_name  name fragment embedded in the C identifier
#   @maps      array refs of [$jis, $sjis, $unicode] triples
sub print_0212_table_index {
    my ($map_name, @maps) = @_;

    my $opening = "static const uint16_t g${map_name}Index[] = {";
    my $closing = "\n};\n\n";

    print MAP $opening;
    print_jis_table_index(@maps);
    print MAP $closing;
}
261
262
# Print the C array gJapaneseMap to the global MAP handle: every 94-entry row
# of the pool in offset order, each preceded by an "index N" comment.  Cells
# that are empty or zero are written as 0xFFFD.
sub print_table {
    print MAP "static const uint16_t gJapaneseMap[] = {";

    my $offset = 0;
    while ( $offset < $table_next_count ) {
        print MAP "\n /* index $offset */\n ";
        my $row = $table[$offset];

        for my $cell ( 0 .. 93 ) {
            my $ucs = $row->[$cell];
            $ucs = 0xfffd if $ucs == 0;
            printf MAP " 0x%04X,", $ucs;

            # Line break after the 7th value of a row and every 8th after that.
            print MAP "\n " if ( $cell + 2 ) % 8 == 0;
        }

        $offset += 94;
    }

    print MAP "\n};\n";
}
282
283
# ---- Load the mapping files (code in column 0, Unicode in column 1) ----
my %cp932   = read_sjis_map( 'CP932.TXT',   0, 1 );
my %ibm     = read_sjis_map( 'IBM943.TXT',  0, 1 );
my $jis0212 = read_0212_map( 'JIS0212.TXT', 0, 1 );

# ---- Unicode -> legacy code files ---------------------------------------
# The jis0201 and jis0208 files share one %printed set, so a Unicode value
# written to the first file is not written again to the second.
%printed = ();
for my $charset (qw(jis0201 jis0208)) {
    write_fromu_map( "$charset-uf-unify", 'jis',
        $cp932{$charset}, $ibm{$charset} );
}

%printed = ();
write_fromu_map( 'jis0208ext-uf-unify', 'jis',
    $cp932{jis0208undef}, $ibm{jis0208undef} );

%printed = ();
my @sjis_classes = qw(jis0201 jis0208 jis0208undef sjis1undef sjis2undef);
write_fromu_map( 'sjis-uf-unify', 'sjis',
    @cp932{@sjis_classes}, @ibm{@sjis_classes} );

# ---- japanese.map --------------------------------------------------------
# MAP stays a global bareword handle: the print_* subroutines write to it.
open( MAP, '>', 'japanese.map' ) or die $!;
binmode MAP;

# Copy the header stored after __DATA__, stopping at the first line that
# begins with "!".
while ( defined( my $header_line = <DATA> ) ) {
    last if $header_line =~ /^!/;
    print MAP $header_line;
}

print MAP "/* generated by jamap.pl @source_files */\n\n";
print MAP <<"EOM";
// IE-compatible handling of undefined codepoints:
// 0x80 --> U+0080
// 0xa0 --> U+F8F0
// 0xfd --> U+F8F1
// 0xfe --> U+F8F2
// 0xff --> U+F8F3
EOM

my @index_classes = qw(jis0208 jis0208undef sjis2undef);
print_table_index( 'CP932',  @cp932{@index_classes} );
print_table_index( 'IBM943', @ibm{@index_classes} );
print_0212_table_index( 'JIS0212', $jis0212 );

# Must come last: the index printers above are what fill the row pool.
print_table();

close MAP or warn $!;
333
334 __DATA__
335 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
336 /* This Source Code Form is subject to the terms of the Mozilla Public
337 * License, v. 2.0. If a copy of the MPL was not distributed with this
338 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
339
340 !

mercurial