#!/usr/bin/perl
$ID = "mkjpconv.pl @ARGV (Time-stamp: <2001-08-08 18:54:54 shom>)";

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

#
# based on CP932.TXT from unicode.org
# additional information from SHIFTJIS.TXT from unicode.org
#
# mapping policy:
# jis0208 to unicode : based on CP932
# unicode to jis0208 : based on CP932
# the lowest code is used for dual mapping to jis0208
# ascii region : based on ISO8859-1 ( same as CP932 ) IGNORE?
# kana region : based on CP932
# IBM Ext(0xFxxx>) : premap to NEC region ( mappable to JIS )

# Both the SHIFTJIS.TXT placeholder argument and at least one input table
# are required; without an input table every output would be empty and
# named after an empty source name.
if (@ARGV < 2 || $ARGV[0] eq "") {
    print STDERR "usage: mkjpconv.pl SHIFTJIS.TXT <INFILE(ex:CP932.TXT)> [Another check]\n";
    exit 1;
}

# Record every code listed in SHIFTJIS.TXT in %defined, keyed as "0xHHLL"
# (single-byte codes get a blank in place of the high byte).
# NOTE: the table is always read from ./SHIFTJIS.TXT; the first command-line
# argument is only consumed by the shift below.
open(SI, "<", "SHIFTJIS.TXT") || die "mkjpconv.pl: cannot open SHIFTJIS.TXT: $!\n";
while (<SI>) {
    ($hi, $lo) = /^0x(..)?(..)\s/;
    if ($lo eq "") { next; }      # not a mapping line
    if ($hi eq "") { $hi = " " }
    $defined{"0x$hi$lo"} = 1;
}
close(SI);

# Drop the SHIFTJIS.TXT argument; what remains are the input tables.
shift(@ARGV);

# The first input table names all generated files.
$src = $ARGV[0];

# Directory for the generated tables; an already existing directory is fine.
$gendir = "$src.d";
unless (-d $gendir) {
    mkdir($gendir) || die "mkjpconv.pl: cannot create directory $gendir: $!\n";
}

$sufile = "sjis2ucs-$src.map";
$usfile = "ucs2sjis-$src.map";
$jufile = "jis2ucs-$src.map";
$jeufile = "jisext2ucs-$src.map";
$jaufile = "jisasc2ucs-$src.map";
$jrkufile = "jiskana2ucs-$src.map";
$ujfile = "ucs2jis-$src.map";
$ujefile = "ucs2jisext-$src.map";
$ujafile = "ucs2jisasc-$src.map";
$ujrkfile = "ucs2jiskana-$src.map";
$ibmnecfile = "$gendir/IBMNEC.map";
$jdxfile = "$gendir/jis0208.ump";
$jdxextfile = "jis0208ext.ump";
$commentfile = "comment-$src.txt";

# Slurp NPL.header into $NPL; it is printed at the top of generated tables.
open(IN, "<", "NPL.header") || die "mkjpconv.pl: cannot open NPL.header: $!\n";
while (<IN>) {
    $NPL .= $_;
}
close(IN);

# Parse the input tables.  The first table builds both directions of the
# mapping (%fromto and %tofrom); any further table is only cross-checked
# against %fromto ("Another check" on the command line).
foreach $infile (@ARGV) {

    open(IN, "<", $infile) || die "mkjpconv.pl: cannot open $infile: $!\n";

    while (<IN>) {
        # "<from> <to>[+<seq>] [# comment]"; everything stays undefined
        # when the line does not match.
        ($from, $to, $seq, $dum, $comment) =
            /^\s*(0x[0-9a-fA-F]+)\s+(0x[0-9a-fA-F]+)(\+0x\S+)?(\s+\#\s*(\S.*))?$/;
        if ( $seq ne "" ) {
            print "Warning: Unicode Seq:\t$from\t$to$seq\t# $comment\n";
        }

        if ( $from eq "" ) { next; }

        # Two-digit (single byte) codes are stored with a leading blank.
        if ( $from =~ /0x(..)$/ ) {
            $from = " 0x$1";
        }

        if ( $fromto{$from} eq "" ) {
            push(@fromlist, $from);
            $fromto{$from} = $to;
            $commentbody{$from} = $comment;
            $commentseq{$from} = $seq;
        } elsif ( $fromto{$from} ne $to ) {
            # another mapping SJIS:UCS2 = 1:N
            print "Another map in $infile\t$from\t$fromto{$from},$to\n";
        }

        # Secondary tables do not contribute to the reverse mapping.
        if ($checkanother == 1) {
            next;
        }

        # %tofrom holds a comma separated list of every source code that
        # maps to $to, each code listed once.
        if ( $tofrom{$to} eq "" ) {
            $tofrom{$to} = $from;
        } elsif ( !grep { $_ eq $from } split(/,/, $tofrom{$to}) ) {
            $tofrom{$to} = "$tofrom{$to},$from";
        }

        # print "$from $to\n";
    }

    close(IN);

    # Was "$checkanother == 1;", a comparison with no effect; the flag has
    # to be assigned so the remaining tables are treated as check-only.
    $checkanother = 1;
}

# Dump the source codes with their mapping and comment text.
open(COMMENT, ">", $commentfile) || die "mkjpconv.pl: cannot create $commentfile: $!\n";
foreach $from (sort(@fromlist)) {
    print COMMENT "$from\t$fromto{$from}$commentseq{$from}\t$commentbody{$from}\n";
}
close(COMMENT);


open(SU, ">", $sufile) || die "mkjpconv.pl: cannot create $sufile: $!\n";
open(US, ">", $usfile) || die "mkjpconv.pl: cannot create $usfile: $!\n";
open(JU, ">", $jufile) || die "mkjpconv.pl: cannot create $jufile: $!\n";
open(JEU, ">", $jeufile) || die "mkjpconv.pl: cannot create $jeufile: $!\n";
open(JAU, ">", $jaufile) || die "mkjpconv.pl: cannot create $jaufile: $!\n";
open(JRKU, ">", $jrkufile) || die "mkjpconv.pl: cannot create $jrkufile: $!\n";
open(UJ, ">", $ujfile) || die "mkjpconv.pl: cannot create $ujfile: $!\n";
open(UJE, ">", $ujefile) || die "mkjpconv.pl: cannot create $ujefile: $!\n";
open(UJA, ">", $ujafile) || die "mkjpconv.pl: cannot create $ujafile: $!\n";
open(UJRK, ">", $ujrkfile) || die "mkjpconv.pl: cannot create $ujrkfile: $!\n";
open(IBMNEC, ">", $ibmnecfile) || die "mkjpconv.pl: cannot create $ibmnecfile: $!\n";

# print SU "/* generated from $src : SJIS UCS2 */\n";
# print US "/* generated from $src : UCS2 SJIS */\n";
print "Generated from $src\n";
print "Command: mkjpconv.pl @ARGV\n";
print "SJIS(JIS)\tUCS2\tSJIS\tS:U:S\tSJIS lower\n";

# One report row on STDOUT per source code, plus the matching entries in
# the per-region map files.
foreach $i (sort(@fromlist)) {

    $ucs = "";

    $sjis = $i;
    $sjis =~ s/\s+//;             # drop the blank added to single-byte codes
    $jis = sjistojis($sjis);      # "" unless $sjis is a two-byte code in range

    print "$i($jis)\t$fromto{$i}\t$tofrom{$fromto{$i}}";
    $ucs = $fromto{$i};

    if ( $i eq $tofrom{$fromto{$i}} ) {
        print "\t1:1:1";
        print "\t$i";
    } else {
        print "\t1:1:N";
        @tolist = split(/,/, $tofrom{$fromto{$i}});
        print "\t$tolist[0]";
        #$ucs = $tolist[0];
        if ( $sjis =~ /0xF[A-D]../ ) {
            # remember the first listed code for this IBM extension code
            $ibmnec{$sjis} = $tolist[0];
            #print IBMNEC "$sjis\t$tolist[0]\n";
        }

    }
    print SU "$sjis\t$ucs\n";
    push(@uslist, "$ucs\t$sjis\n");

    #print US "$ucs\t$sjis\n";
    if ( $jis ne "") {
        #if ($sjis =~ /^0x87../ || $sjis =~ /^0xED../ ) {
        # cp932 ext
        if ($sjis =~ /0x..../ && $defined{$sjis} != 1) {
            # jis not define
            print JEU "$jis\t$ucs\n";
            push(@ujelist, "$ucs\t$jis\n");
            $jisextucs{$jis} = $ucs;
        } else {
            print JU "$jis\t$ucs\n";
            push(@ujlist, "$ucs\t$jis\n");
            $jisucs{$jis} = $ucs;
        }

        #print UJ "$ucs\t$jis\n";
    } elsif ( $sjis =~ /\s*0x([8-9A-D].)/ ) {
        $code = $1;
        print JRKU "0x00$code\t$ucs\n";
        push(@ujrklist, "$ucs\t0x00$code\n");
    } elsif ( $sjis =~ /\s*0x([0-7].)/ ) {
        $code = $1;
        print JAU "0x00$code\t$ucs\n";
        push(@ujalist, "$ucs\t0x00$code\n");
    }
    #print "\t# $comment{$i}\n";
    print "\n";
}

# The reverse (UCS2 keyed) maps are written sorted.
print US sort(@uslist);
print UJ sort(@ujlist);
print UJE sort(@ujelist);
print UJA sort(@ujalist);
print UJRK sort(@ujrklist);

# make ibmnec mapping

print IBMNEC $NPL;
print IBMNEC "/* generated by $ID */\n";
print IBMNEC "/* IBM ext codes to NEC sel (in CP932) */\n\n";

# One table entry per code 0xFA40-0xFAFF, 0xFB00-0xFBFF and 0xFC00-0xFCFF.
foreach $i (0xFA, 0xFB, 0xFC) {
    for ($j = ( ($i == 0xFA) ? 0x40 : 0x00 ); $j <= 0xFF; $j++) {
        $ibm = sprintf("0x%02X%02X", $i, $j);
        $raw = substr($ibm, 2, 6);      # hex digits without the "0x" prefix
        # String test: the values are "0x...." strings, and a numeric ==
        # against "" is true for all of them (both sides numify to 0).
        if (!defined($ibmnec{$ibm}) || $ibmnec{$ibm} eq "") {
            print IBMNEC "/* $raw:UNDEF */ 0, \n";
        } else {
            print IBMNEC "/* $raw */ $ibmnec{$ibm}, \n";
        }
    }
}

close(IBMNEC);

# make jdx

open(JDX, ">", $jdxfile) || die "mkjpconv.pl: cannot create $jdxfile: $!\n";

print JDX $NPL;
print JDX "/* generated by $ID */\n";
print JDX "/* JIS X 0208 (with CP932 ext) to Unicode mapping */\n";

# 94 x 94 table; both JIS bytes run over 0x21..0x7E.
for ($i = 0; $i < 94; $i++) {
    printf JDX "/* 0x%2XXX */\n", ($i+0x21);
    printf JDX " ";
    for ($j = 0; $j < 94; $j++) {
        $jis = sprintf("0x%02X%02X", ($i+0x21), $j+0x21);
        # get JIS
        $ucs = $jisucs{$jis};
        # String tests here as well; a numeric == against "" cannot tell a
        # "0x...." value from a missing one.
        if (!defined($ucs) || $ucs eq "") {
            # try CP932 ext
            # try jis ext
            $ucs = $jisextucs{$jis};
        }
        if (!defined($ucs) || $ucs eq "") {
            # undefined
            print JDX "0xFFFD,";
        } else {
            print JDX "$ucs,";
        }
        # Break the row after the entries whose low byte ends in 7 or F and
        # label the chunk just written with its first code (low nibble 0 or 8).
        if (7 == ( ($j+1) % 8 )) {
            printf JDX "/* 0x%2X%1X%1X*/\n", $i+0x21, int(2+($j/16)), (6==($j%16))?0:8;
        }
    }
    # label for the trailing partial chunk ($j is 94 here)
    printf JDX " /* 0x%2X%1X%1X*/\n", $i+0x21, int(2+($j/16)), (6==($j%16))?0:8;
}

close (JDX);


close(SU);
close(US);
close(JU);
close(JEU);
close(JAU);
close(JRKU);
close(UJ);
close(UJE);
close(UJA);
close(UJRK);

# generate uf files

# genuf(INFILE, OUTFILE)
#   Pipes INFILE through "./umaptable -uf" and stores the result as
#   $gendir/OUTFILE.  INFILE is passed to the shell unquoted, so it may name
#   several files separated by blanks.  Returns the status from system(); a
#   failing command is reported on STDERR but does not abort the script.
sub genuf {
    my ($infile, $outfile) = @_;
    my $com = "cat $infile | ./umaptable -uf > $gendir/$outfile";
    print "Executing $com\n";
    my $status = system($com);
    if ($status != 0) {
        print STDERR "Warning: command failed (status $status): $com\n";
    }
    return $status;
}

genuf($sufile, "sjis.uf");
genuf($jufile, "jis0208.uf");
# Any extension entry at all warrants the ext table; "$#ujelist > 0" skipped
# it when exactly one entry had been collected.
if ( @ujelist > 0 ) {
    genuf($jeufile, "jis0208ext.uf");
} else {
    print "Extension is not found. jis0208ext.uf is not generated.\n";
}
genuf("$jaufile $jrkufile", "jis0201.uf");
# genuf($jaufile, "jis0201.uf");
# genuf($jrkufile, "jis0201gl.uf");


# generate test page


exit;

# sjistojis(SJIS)
#   Converts a two-byte Shift_JIS code written as "0xHHLL" into the matching
#   JIS code string "0xHHLL" (both bytes offset from 0x21).
#   Returns "" when the argument is not "0x" followed by exactly four
#   characters, or when the computed row falls outside the 94 available rows.
sub sjistojis {
    my ($sjis) = (@_);
    my ($first, $second);
    my $jnum = 0;                 # linear index: row * 94 + cell

    if ( $sjis !~ /^0x....$/ ) {
        return "";
    }

    $first = hex(substr($sjis,2,2));
    $second = hex(substr($sjis,4,2));

    # Each lead byte covers (0xfd - 0x80) + (0x7f - 0x40) trail positions.
    # Lead bytes from 0xE0 upward continue right after the 0x81..0x9F range.
    if ($first < 0xE0) {
        $jnum = ($first - 0x81) * ((0xfd - 0x80)+(0x7f - 0x40));
    } else {
        $jnum = ($first - 0xe0 + (0xa0-0x81)) * ((0xfd - 0x80)+(0x7f - 0x40));
    }

    # Trail bytes from 0x80 upward follow the 0x40..0x7E positions.
    if ($second >= 0x80) {
        $jnum += $second - 0x80 + (0x7f-0x40);
    } else {
        $jnum += $second - 0x40;
    }

    if (($jnum / 94 ) < 94) {
        # %X truncates the fractional row value to an integer
        return sprintf "0x%02X%02X", (($jnum / 94) + 0x21), (($jnum % 94)+0x21);
    } else {
        #return sprintf "# 0x%02X%02X", (($jnum / 94) + 0x21), (($jnum % 94)+0x21);
        return "";
    }
}