intl/uconv/tools/mkjpconv.pl

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rwxr-xr-x

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

#!/usr/bin/perl
# Identification string (script name, command-line arguments and a fixed
# time stamp) that is embedded in the header of the generated tables.
$ID = join(' ', 'mkjpconv.pl', "@ARGV", '(Time-stamp: <2001-08-08 18:54:54 shom>)');
michael@0 3
michael@0 4 # This Source Code Form is subject to the terms of the Mozilla Public
michael@0 5 # License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 6 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0 7
michael@0 8 #
michael@0 9 # based on CP932.TXT from unicode.org
michael@0 10 # additional information from SHIFTJIS.TXT from unicode.org
michael@0 11 #
michael@0 12 # mapping policy:
michael@0 13 # jis0208 to unicode : based on CP932
michael@0 14 # unicode to jis0208 : based on CP932
michael@0 15 # the lowest code is used for dual mapping to jis0208
michael@0 16 # ascii region : based on ISO8859-1 ( same as CP932 ) IGNORE?
michael@0 17 # kana region : based on CP932
michael@0 18 # IBM Ext(0xFxxx>) : premap to NEC region ( mappable to JIS )
michael@0 19
# Command line: mkjpconv.pl SHIFTJIS.TXT INFILE [ANOTHER-CHECK-FILE ...]
#
# NOTE: the first argument is only a placeholder that is shifted away below;
# the list of JIS-defined codes is always read from "SHIFTJIS.TXT" in the
# current directory.  INFILE (e.g. CP932.TXT) is the primary mapping table and
# also provides the stem of every output file name, so it is mandatory.
if (@ARGV < 2 || $ARGV[0] eq "") {
    print STDERR "usage: mkjpconv.pl SHIFTJIS.TXT <INFILE(ex:CP932.TXT)> [Another check]\n";
    exit 1;
}

# Record every Shift_JIS code listed in SHIFTJIS.TXT in %defined.  The keys
# have the form "0xHHLL"; single-byte codes are stored with a blank in place
# of the missing lead byte.
open(my $sjis_fh, '<', "SHIFTJIS.TXT")
    or die "mkjpconv.pl: cannot open SHIFTJIS.TXT: $!\n";
while (my $line = <$sjis_fh>) {
    my ($hi, $lo) = $line =~ /^0x(..)?(..)\s/;
    next if !defined($lo) || $lo eq "";
    $hi = " " if !defined($hi) || $hi eq "";
    $defined{"0x$hi$lo"} = 1;
}
close($sjis_fh);

# Drop the SHIFTJIS.TXT placeholder; the remaining arguments are input tables.
shift(@ARGV);

$src = $ARGV[0];

# Directory that receives the generated tables.  An already existing
# directory is fine; any other mkdir failure is fatal.
$gendir = "$src.d";
unless (mkdir($gendir)) {
    my $mkdir_error = $!;
    die "mkjpconv.pl: cannot create directory $gendir: $mkdir_error\n"
        unless -d $gendir;
}

# Names of the intermediate map files and of the generated tables.
$sufile = "sjis2ucs-$src.map";
$usfile = "ucs2sjis-$src.map";
$jufile = "jis2ucs-$src.map";
$jeufile = "jisext2ucs-$src.map";
$jaufile = "jisasc2ucs-$src.map";
$jrkufile = "jiskana2ucs-$src.map";
$ujfile = "ucs2jis-$src.map";
$ujefile = "ucs2jisext-$src.map";
$ujafile = "ucs2jisasc-$src.map";
$ujrkfile = "ucs2jiskana-$src.map";
$ibmnecfile = "$gendir/IBMNEC.map";
$jdxfile = "$gendir/jis0208.ump";
$jdxextfile = "jis0208ext.ump";
$commentfile = "comment-$src.txt";

# Load the header text that is copied to the top of the generated tables.
open(my $npl_fh, '<', "NPL.header")
    or die "mkjpconv.pl: cannot open NPL.header: $!\n";
while (my $line = <$npl_fh>) {
    $NPL .= $line;
}
close($npl_fh);
michael@0 61
# Read every mapping table named on the command line.
#
# Each data line has the form "0xSJIS 0xUCS[+0xSEQ] [# comment]".  The first
# table fills both the forward map (%fromto, plus @fromlist, %commentbody and
# %commentseq) and the reverse map (%tofrom, a comma separated list of source
# codes per Unicode value).  Every later table is a "check" table: its lines
# are still compared against the forward map (and new source codes are still
# added to it), but it no longer contributes to the reverse map.
foreach my $infile (@ARGV) {

    open(my $in, '<', $infile)
        or die "mkjpconv.pl: cannot open $infile: $!\n";

    while (my $line = <$in>) {
        my ($from, $to, $seq, $dum, $comment) =
            $line =~ /^\s*(0x[0-9a-fA-F]+)\s+(0x[0-9a-fA-F]+)(\+0x\S+)?(\s+\#\s*(\S.*))?$/;
        if ($seq ne "") {
            print "Warning: Unicode Seq:\t$from\t$to$seq\t# $comment\n";
        }

        # Not a mapping line (comment, blank, undefined code, ...).
        next if $from eq "";

        # Single-byte codes get a leading blank so that they sort before
        # the double-byte codes.
        if ($from =~ /0x(..)$/) {
            $from = " 0x$1";
        }

        if ($fromto{$from} eq "") {
            push(@fromlist, $from);
            $fromto{$from} = $to;
            $commentbody{$from} = $comment;
            $commentseq{$from} = $seq;
        } elsif ($fromto{$from} ne $to) {
            # another mapping SJIS:UCS2 = 1:N
            print "Another map in $infile\t$from\t$fromto{$from},$to\n";
        }

        # Check tables must not contribute to the reverse map.
        next if $checkanother == 1;

        if ($tofrom{$to} eq "") {
            $tofrom{$to} = $from;
        } elsif (!grep { $_ eq $from } split(/,/, $tofrom{$to})) {
            # Append only source codes that are not in the list yet.
            $tofrom{$to} = "$tofrom{$to},$from";
        }

        # print "$from $to\n";
    }

    close($in);

    # Everything after the first table is check-only.  (This used to be the
    # no-op comparison "$checkanother == 1;".)
    $checkanother = 1;
}
michael@0 108
# Dump every source code together with its Unicode value, the optional
# Unicode sequence suffix and the comment taken from the input table.
open(COMMENT, ">$commentfile") or die;
print COMMENT "$_\t$fromto{$_}$commentseq{$_}\t$commentbody{$_}\n"
    foreach sort(@fromlist);
close(COMMENT);


# Output map files.  The handles are package-level barewords on purpose:
# later parts of the script keep printing to them and close them.
open(SU,     ">$sufile")     or die;
open(US,     ">$usfile")     or die;
open(JU,     ">$jufile")     or die;
open(JEU,    ">$jeufile")    or die;
open(JAU,    ">$jaufile")    or die;
open(JRKU,   ">$jrkufile")   or die;
open(UJ,     ">$ujfile")     or die;
open(UJE,    ">$ujefile")    or die;
open(UJA,    ">$ujafile")    or die;
open(UJRK,   ">$ujrkfile")   or die;
open(IBMNEC, ">$ibmnecfile") or die;

# print SU "/* generated from $src : SJIS UCS2 */\n";
# print US "/* generated from $src : UCS2 SJIS */\n";
print "Generated from $src\n";
print "Command: mkjpconv.pl @ARGV\n";
print "SJIS(JIS)\tUCS2\tSJIS\tS:U:S\tSJIS lower\n";

# Walk all source codes in sorted order, print one report line per code and
# distribute the mapping over the individual map files.
foreach my $entry (sort(@fromlist)) {

    my $ucs = $fromto{$entry};

    # Same code without the sort padding, and its JIS equivalent ("" when
    # the code has none).
    (my $sjis = $entry) =~ s/\s+//;
    my $jis = sjistojis($sjis);

    my $back = $tofrom{$ucs};
    print "${entry}($jis)\t$ucs\t$back";

    if ($entry eq $back) {
        # The Unicode value maps back to exactly this code.
        print "\t1:1:1", "\t$entry";
    } else {
        # The Unicode value has another (or more than one) source code; the
        # first one listed is reported.
        my @alternatives = split(/,/, $back);
        print "\t1:1:N", "\t$alternatives[0]";
        #$ucs = $alternatives[0];
        # Remember where the codes with lead byte 0xFA-0xFD map back to.
        $ibmnec{$sjis} = $alternatives[0] if $sjis =~ /0xF[A-D]../;
        #print IBMNEC "$sjis\t$alternatives[0]\n";
    }

    print SU "$sjis\t$ucs\n";
    push(@uslist, "$ucs\t$sjis\n");
    #print US "$ucs\t$sjis\n";

    if ($jis ne "") {
        #if ($sjis =~ /^0x87../ || $sjis =~ /^0xED../ ) {
        if ($sjis =~ /0x..../ && $defined{$sjis} != 1) {
            # cp932 ext: the code is not listed in SHIFTJIS.TXT
            print JEU "$jis\t$ucs\n";
            push(@ujelist, "$ucs\t$jis\n");
            $jisextucs{$jis} = $ucs;
        } else {
            print JU "$jis\t$ucs\n";
            push(@ujlist, "$ucs\t$jis\n");
            $jisucs{$jis} = $ucs;
        }
        #print UJ "$ucs\t$jis\n";
    } elsif ($sjis =~ /\s*0x([8-9A-D].)/) {
        # code without JIS equivalent, first hex digit 8-9/A-D: kana map
        my $byte = $1;
        print JRKU "0x00$byte\t$ucs\n";
        push(@ujrklist, "$ucs\t0x00$byte\n");
    } elsif ($sjis =~ /\s*0x([0-7].)/) {
        # code without JIS equivalent, first hex digit 0-7: ascii map
        my $byte = $1;
        print JAU "0x00$byte\t$ucs\n";
        push(@ujalist, "$ucs\t0x00$byte\n");
    }
    #print "\t# $comment{$entry}\n";
    print "\n";
}

# The reverse (Unicode -> code) maps are written sorted by Unicode value.
print US   sort(@uslist);
print UJ   sort(@ujlist);
print UJE  sort(@ujelist);
print UJA  sort(@ujalist);
print UJRK sort(@ujrklist);
michael@0 196
# make ibmnec mapping
#
# Writes one table slot per code with lead byte 0xFA, 0xFB or 0xFC: the code
# recorded in %ibmnec for it, or 0 when nothing was recorded.  For lead byte
# 0xFA the table starts at trail byte 0x40, for the others at 0x00.

print IBMNEC $NPL;
print IBMNEC "/* generated by $ID */\n";
print IBMNEC "/* IBM ext codes to NEC sel (in CP932) */\n\n";

foreach my $lead (0xFA, 0xFB, 0xFC) {
    my $first_trail = ($lead == 0xFA) ? 0x40 : 0x00;
    foreach my $trail ($first_trail .. 0xFF) {
        my $ibm = sprintf("0x%02X%02X", $lead, $trail);
        my $raw = substr($ibm, 2);      # the four hex digits without "0x"
        my $nec = $ibmnec{$ibm};
        # String test on purpose: a numeric "==" turns both "" and
        # "0x...." into 0 and would report every slot as undefined.
        if (!defined($nec) || $nec eq "") {
            print IBMNEC "/* $raw:UNDEF */ 0, \n";
        } else {
            print IBMNEC "/* $raw */ $nec, \n";
        }
    }
}

close(IBMNEC);
michael@0 216
# make jdx
#
# Writes the 94x94 JIS X 0208 -> Unicode table.  Each cell holds the value
# from %jisucs, falling back to the CP932 extension value from %jisextucs,
# and 0xFFFD when neither map defines the code.

open(my $jdx, '>', $jdxfile)
    or die "mkjpconv.pl: cannot create $jdxfile: $!\n";

print {$jdx} $NPL;
print {$jdx} "/* generated by $ID */\n";
print {$jdx} "/* JIS X 0208 (with CP932 ext) to Unicode mapping */\n";

for my $row (0 .. 93) {
    printf {$jdx} "/* 0x%2XXX */\n", $row + 0x21;
    print {$jdx} " ";
    for my $col (0 .. 93) {
        my $jis = sprintf("0x%02X%02X", $row + 0x21, $col + 0x21);
        # get JIS
        my $ucs = $jisucs{$jis};
        # String tests on purpose: a numeric "==" turns both "" and
        # "0x...." into 0 and would treat every cell as undefined.
        if (!defined($ucs) || $ucs eq "") {
            # try CP932 ext
            # try jis ext
            $ucs = $jisextucs{$jis};
        }
        if (!defined($ucs) || $ucs eq "") {
            # undefined
            print {$jdx} "0xFFFD,";
        } else {
            print {$jdx} "$ucs,";
        }
        # Close the output line with a position comment.
        if (7 == (($col + 1) % 8)) {
            printf {$jdx} "/* 0x%2X%1X%1X*/\n",
                $row + 0x21, 2 + int($col / 16), (6 == ($col % 16)) ? 0 : 8;
        }
    }
    # Position comment after the last (short) line of the row; 94 is the
    # value the column counter has after the loop.
    printf {$jdx} " /* 0x%2X%1X%1X*/\n",
        $row + 0x21, 2 + int(94 / 16), (6 == (94 % 16)) ? 0 : 8;
}

close($jdx);


# All map files are complete; close them so that their contents are on disk
# before they are read again.
close(SU);
close(US);
close(JU);
close(JEU);
close(JAU);
close(JRKU);
close(UJ);
close(UJE);
close(UJA);
close(UJRK);
michael@0 263
# generate uf files

# genuf($infile, $outfile)
#
# Pipes the map file(s) named by $infile through ./umaptable -uf and stores
# the result as $gendir/$outfile.  $infile is pasted into a shell command
# line, so it may hold several file names separated by blanks.
# Prints the command before running it, warns on STDERR when the command
# does not succeed, and returns the status value reported by system().
sub genuf {
    my ($infile, $outfile) = @_;
    my $com = "cat $infile | ./umaptable -uf > $gendir/$outfile";
    print "Executing $com\n";
    my $status = system($com);
    if ($status != 0) {
        print STDERR "Warning: command failed (status $status): $com\n";
    }
    return $status;
}
michael@0 272
genuf($sufile, "sjis.uf");
genuf($jufile, "jis0208.uf");
# Generate the extension table as soon as there is at least one extension
# mapping ("$#ujelist > 0" skipped the case of exactly one mapping).
if (@ujelist) {
    genuf($jeufile, "jis0208ext.uf");
} else {
    print "Extension is not found. jis0208ext.uf is not generated.\n";
}
# The ascii and the kana map are combined into a single table.
genuf("$jaufile $jrkufile", "jis0201.uf");
# genuf($jaufile, "jis0201.uf");
# genuf($jrkufile, "jis0201gl.uf");


# generate test page


exit;
michael@0 289
# sjistojis($sjis)
#
# Converts a double-byte Shift_JIS code given as the string "0xHHLL" into the
# corresponding JIS code string "0xHHLL".
#
# Returns "" when the argument is not "0x" followed by exactly four hex
# digits, or when the code falls outside the 94x94 JIS plane.
sub sjistojis {
    my ($sjis) = @_;

    return "" if !defined($sjis) || $sjis !~ /\A0x[0-9A-Fa-f]{4}\z/;

    my $first  = hex(substr($sjis, 2, 2));
    my $second = hex(substr($sjis, 4, 2));

    # Trail bytes 0x40-0x7E give 0x7f-0x40 = 63 cells and 0x80-0xFC give
    # 0xfd-0x80 = 125 cells, i.e. 188 cells (two JIS rows) per lead byte.
    my $low_cells      = 0x7f - 0x40;
    my $cells_per_lead = (0xfd - 0x80) + $low_cells;

    # Lead bytes from 0xE0 on continue directly after lead byte 0x9F.
    my $lead_index = ($first < 0xE0)
        ? $first - 0x81
        : $first - 0xe0 + (0xa0 - 0x81);

    # The gap at 0x7F is skipped for trail bytes from 0x80 on.
    my $trail_index = ($second >= 0x80)
        ? $second - 0x80 + $low_cells
        : $second - 0x40;

    my $jnum = $lead_index * $cells_per_lead + $trail_index;

    # A negative index (lead byte below 0x81, or 0x81 with a trail byte
    # below 0x40) has no JIS equivalent.
    return "" if $jnum < 0;

    my $row = int($jnum / 94);
    if ($row < 94) {
        return sprintf("0x%02X%02X", $row + 0x21, ($jnum % 94) + 0x21);
    }
    #return sprintf "# 0x%02X%02X", ($row + 0x21), (($jnum % 94)+0x21);
    return "";
}
michael@0 323

mercurial