intl/uconv/tools/mkjpconv.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/uconv/tools/mkjpconv.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,323 @@
     1.4 +#!/usr/bin/perl
     1.5 +$ID = "mkjpconv.pl @ARGV (Time-stamp: <2001-08-08 18:54:54 shom>)";
     1.6 +
     1.7 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.8 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.9 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
    1.10 +
    1.11 +#
    1.12 +# based on CP932.TXT from unicode.org
    1.13 +# additional information from SHIFTJIS.TXT from unicode.org
    1.14 +#
    1.15 +# mapping policy:
    1.16 +#   jis0208 to unicode : based on CP932
    1.17 +#   unicode to jis0208 : based on CP932
    1.18 +#                        the lowest code is used for dual mapping to jis0208
    1.19 +#   ascii region       : based on ISO8859-1 ( same as CP932 ) IGNORE?
    1.20 +#   kana region        : based on CP932
    1.21 +#   IBM Ext(0xFxxx>)   : premap to NEC region ( mappable to JIS )
    1.22 +
    1.23 +if ($ARGV[0] eq "") {  # no arguments at all: print usage and quit
    1.24 +    print STDERR "usage: mkjpconv.pl SHIFTJIS.TXT <INFILE(ex:CP932.TXT)> [Another check]\n";
    1.25 +    exit 1;
    1.26 +}
    1.27 +
    1.28 +open (SI, "SHIFTJIS.TXT") || die;  # NOTE(review): name is hard-coded; $ARGV[0] is only tested above, never opened - confirm intent
    1.29 +while(<SI>) {
    1.30 +    ($hi,$lo) = /^0x(..)?(..)\s/;  # leading hex code: optional first pair, then last pair
    1.31 +    if ($lo eq "") { next; }  # line does not start with a 0x code
    1.32 +    if ($hi eq "") { $hi="  " }  # two-digit code: use two spaces in place of the missing pair
    1.33 +    $defined{"0x$hi$lo"} = 1;  # remember every code listed in SHIFTJIS.TXT
    1.34 +}
    1.35 +close (SI);
    1.36 +
    1.37 +shift(@ARGV);  # drop the first argument; the remaining ones are the mapping files read below
    1.38 +
    1.39 +$src = $ARGV[0];  # first remaining argument; used as a suffix/prefix for the generated names
    1.40 +
    1.41 +$gendir = "$src.d";
    1.42 +mkdir("$src.d");  # return value is not checked
    1.43 +
    1.44 +$sufile = "sjis2ucs-$src.map";  # SJIS -> UCS pairs
    1.45 +$usfile = "ucs2sjis-$src.map";  # UCS -> SJIS pairs, written sorted
    1.46 +$jufile = "jis2ucs-$src.map";  # JIS -> UCS pairs
    1.47 +$jeufile = "jisext2ucs-$src.map";  # JIS -> UCS for double-byte codes not listed in SHIFTJIS.TXT
    1.48 +$jaufile = "jisasc2ucs-$src.map";  # codes without a JIS value whose first hex digit is 0-7
    1.49 +$jrkufile = "jiskana2ucs-$src.map";  # codes without a JIS value whose first hex digit is 8-9 or A-D
    1.50 +$ujfile = "ucs2jis-$src.map";  # reverse of $jufile, written sorted
    1.51 +$ujefile = "ucs2jisext-$src.map";  # reverse of $jeufile, written sorted
    1.52 +$ujafile = "ucs2jisasc-$src.map";  # reverse of $jaufile, written sorted
    1.53 +$ujrkfile = "ucs2jiskana-$src.map";  # reverse of $jrkufile, written sorted
    1.54 +$ibmnecfile = "$gendir/IBMNEC.map";
    1.55 +$jdxfile = "$gendir/jis0208.ump";
    1.56 +$jdxextfile = "jis0208ext.ump";  # not referenced again in this script
    1.57 +$commentfile = "comment-$src.txt";
    1.58 +
    1.59 +open (IN, "NPL.header") || die;  # header text later printed at the top of IBMNEC.map and jis0208.ump
    1.60 +while(<IN>) {
    1.61 +    $NPL .= $_;
    1.62 +}
    1.63 +close (IN);
    1.64 +
    1.65 +foreach $infile ( @ARGV ) {  # the first file fills both tables; later files skip the reverse table
    1.66 +
    1.67 +    open (IN, "$infile") || die;
    1.68 +
    1.69 +    while(<IN>) {
    1.70 +	($from, $to, $seq, $dum, $comment) =
    1.71 +	    /^\s*(0x[0-9a-fA-F]+)\s+(0x[0-9a-fA-F]+)(\+0x\S+)?(\s+\#\s*(\S.*))?$/;
    1.72 +	if ( $seq ne "" ) {  # target is written with a trailing "+0x..." part
    1.73 +	    print "Warning: Unicode Seq:\t$from\t$to$seq\t# $comment\n";
    1.74 +	}
    1.75 +
    1.76 +	if ( $from eq "" ) { next; }  # line carries no mapping
    1.77 +	
    1.78 +	if ( $from =~ /0x(..)$/ ) {  # two-digit code: pad with two spaces so keys have equal width
    1.79 +	    $from = "  0x$1";
    1.80 +	}
    1.81 +	
    1.82 +	if ( $fromto{$from} eq "" ) {  # first time this source code is seen
    1.83 +	    push(@fromlist, $from);
    1.84 +	    $fromto{$from} = $to;
    1.85 +	    $commentbody{$from} = $comment;
    1.86 +	    $commentseq{$from} = $seq
    1.87 +	} elsif ( $fromto{$from} ne $to ) {
    1.88 +	    # another mapping SJIS:UCS2 = 1:N
    1.89 +	    print "Another map in $infile\t$from\t$fromto{$from},$to\n";
    1.90 +	}
    1.91 +
    1.92 +	if ($checkanother==1) {  # set once the first file has been read completely
    1.93 +	    next;
    1.94 +	}
    1.95 +
    1.96 +	if ( $tofrom{$to} eq "" ) {  # reverse table: comma-separated source codes per target
    1.97 +	    $tofrom{$to} = $from;
    1.98 +	} else {
    1.99 +	    if ( $from !~ /$tofrom{$to}/ ){
   1.100 +	    $tofrom{$to} = "$tofrom{$to},$from";
   1.101 +	}
   1.102 +	}
   1.103 +    
   1.104 +	# print "$from $to\n";
   1.105 +    }
   1.106 +
   1.107 +    close (IN);
   1.108 +
   1.109 +    $checkanother = 1;  # assignment, not comparison: files after the first must not extend %tofrom
   1.110 +}
   1.111 +
   1.112 +open (COMMENT, ">$commentfile") || die;  # dump of every source code seen, in string-sorted order
   1.113 +foreach $from (sort(@fromlist)) {
   1.114 +    print COMMENT "$from\t$fromto{$from}$commentseq{$from}\t$commentbody{$from}\n";  # code, target(+sequence), comment text
   1.115 +}
   1.116 +close (COMMENT);
   1.117 +
   1.118 +
   1.119 +open(SU, ">$sufile") || die;  # all output map files are opened up front and closed after the tables are written
   1.120 +open(US, ">$usfile") || die;
   1.121 +open(JU, ">$jufile") || die;
   1.122 +open(JEU, ">$jeufile") || die;
   1.123 +open(JAU, ">$jaufile") || die;
   1.124 +open(JRKU, ">$jrkufile") || die;
   1.125 +open(UJ, ">$ujfile") || die;
   1.126 +open(UJE, ">$ujefile") || die;
   1.127 +open(UJA, ">$ujafile") || die;
   1.128 +open(UJRK, ">$ujrkfile") || die;
   1.129 +open(IBMNEC, ">$ibmnecfile") || die;
   1.130 +
   1.131 +# print SU "/* generated from $src : SJIS UCS2 */\n";
   1.132 +# print US "/* generated from $src : UCS2 SJIS */\n";
   1.133 +print "Generated from $src\n";
   1.134 +print "Command: mkjpconv.pl @ARGV\n";
   1.135 +print "SJIS(JIS)\tUCS2\tSJIS\tS:U:S\tSJIS lower\n";  # column header of the report printed to STDOUT
   1.136 +
   1.137 +foreach $i (sort(@fromlist)) {  # one report row and one set of map entries per source code
   1.138 +
   1.139 +    $ucs = "";
   1.140 +
   1.141 +    $sjis = $i;
   1.142 +    $sjis =~ s/\s+//;  # remove the padding added to two-digit codes
   1.143 +    $jis = sjistojis($sjis);  # "" when sjistojis cannot convert the code
   1.144 +
   1.145 +    print "$i($jis)\t$fromto{$i}\t$tofrom{$fromto{$i}}";
   1.146 +    $ucs = $fromto{$i};
   1.147 +
   1.148 +    if ( $i eq $tofrom{$fromto{$i}} ) {  # this code is the only source recorded for its target
   1.149 +	print "\t1:1:1";
   1.150 +	print "\t$i";
   1.151 +    } else {
   1.152 +	print "\t1:1:N";
   1.153 +	@tolist = split(/,/,$tofrom{$fromto{$i}});
   1.154 +	print "\t$tolist[0]";  # first source code recorded for this target
   1.155 +	#$ucs = $tolist[0];
   1.156 +	if ( $sjis =~ /0xF[A-D]../ ) {  # lead byte FA-FD: remember the first-recorded code for the IBMNEC table
   1.157 +	    $ibmnec{$sjis} = $tolist[0];
   1.158 +	    #print IBMNEC "$sjis\t$tolist[0]\n";
   1.159 +	}
   1.160 +
   1.161 +    }
   1.162 +    print SU "$sjis\t$ucs\n";
   1.163 +    push(@uslist, "$ucs\t$sjis\n");  # reverse entries are collected and written sorted after the loop
   1.164 +
   1.165 +    #print US "$ucs\t$sjis\n";
   1.166 +    if ( $jis ne "") {
   1.167 +	#if ($sjis =~ /^0x87../ || $sjis =~ /^0xED../ ) {
   1.168 +	    # cp932 ext
   1.169 +	if ($sjis =~ /0x..../ && $defined{$sjis} != 1) {  # double-byte code that SHIFTJIS.TXT did not list
   1.170 +	    # jis not define
   1.171 +	    print JEU "$jis\t$ucs\n";
   1.172 +	    push(@ujelist, "$ucs\t$jis\n");
   1.173 +	    $jisextucs{$jis} = $ucs;
   1.174 +	} else {
   1.175 +	    print JU "$jis\t$ucs\n";
   1.176 +	    push(@ujlist, "$ucs\t$jis\n");
   1.177 +	    $jisucs{$jis} = $ucs;
   1.178 +	}
   1.179 +
   1.180 +	#print UJ "$ucs\t$jis\n";
   1.181 +    } elsif ( $sjis =~ /\s*0x([8-9A-D].)/ ) {  # no JIS value, first hex digit 8-9 or A-D
   1.182 +	$code = $1;
   1.183 +	print JRKU "0x00$code\t$ucs\n";
   1.184 +	push(@ujrklist, "$ucs\t0x00$code\n");
   1.185 +    } elsif ( $sjis =~ /\s*0x([0-7].)/ ) {  # no JIS value, first hex digit 0-7
   1.186 +	$code = $1;
   1.187 +	print JAU "0x00$code\t$ucs\n";
   1.188 +	push(@ujalist, "$ucs\t0x00$code\n");
   1.189 +    }
   1.190 +    #print "\t# $comment{$i}\n";
   1.191 +    print "\n";
   1.192 +}
   1.193 +
   1.194 +print US sort(@uslist);  # default (string) sort of the "UCS<TAB>code" lines
   1.195 +print UJ sort(@ujlist);
   1.196 +print UJE sort(@ujelist);
   1.197 +print UJA sort(@ujalist);
   1.198 +print UJRK sort(@ujrklist);
   1.199 +
   1.200 +# make ibmnec mapping
   1.201 +
   1.202 +print IBMNEC $NPL;  # header text read from NPL.header
   1.203 +print IBMNEC "/* generated by $ID */\n";
   1.204 +print IBMNEC "/* IBM ext codes to NEC sel (in CP932) */\n\n";
   1.205 +
   1.206 +foreach $i (0xFA, 0xFB, 0xFC) {  # one table slot per trail byte of each lead byte
   1.207 +    for ($j=( ($i==0xFA) ? 0x40 : 0x00 ); $j<=0xFF; $j++) {  # the 0xFA row starts at trail byte 0x40
   1.208 +	$ibm = sprintf("0x%02X%02X", $i, $j);
   1.209 +	$raw = substr($ibm, 2,6);  # the four hex digits without the 0x prefix
   1.210 +	if ("" eq $ibmnec{$ibm}) {  # string test: "0x...." values are all 0 numerically, so == was always true
   1.211 +	    print IBMNEC "/* $raw:UNDEF */ 0, \n";
   1.212 +	} else {
   1.213 +	    print IBMNEC "/* $raw */ $ibmnec{$ibm}, \n";
   1.214 +	}
   1.215 +    }
   1.216 +}
   1.217 +
   1.218 +close(IBMNEC);
   1.219 +
   1.220 +# make jdx
   1.221 +
   1.222 +open (JDX, ">$jdxfile") || die;
   1.223 +    
   1.224 +print JDX $NPL;  # header text read from NPL.header
   1.225 +print JDX "/* generated by $ID */\n";
   1.226 +print JDX "/* JIS X 0208 (with CP932 ext) to Unicode mapping */\n";
   1.227 +
   1.228 +for ($i=0; $i<94; $i++) {  # 94 rows, first byte 0x21 + $i
   1.229 +    printf JDX "/* 0x%2XXX */\n", ($i+0x21);
   1.230 +    printf JDX "       ";
   1.231 +    for ($j=0; $j<94; $j++) {  # 94 cells per row, second byte 0x21 + $j
   1.232 +	$jis = sprintf("0x%02X%02X", ($i+0x21), $j+0x21);
   1.233 +	# get JIS
   1.234 +	$ucs = $jisucs{$jis};
   1.235 +	if ("" eq $ucs) {  # string test: "0x...." values are all 0 numerically, so == was always true
   1.236 +	    # try CP932 ext
   1.237 +	    # try jis ext
   1.238 +	    $ucs = $jisextucs{$jis}
   1.239 +	}
   1.240 +	if ("" eq $ucs) {  # neither table has a value for this cell
   1.241 +	    # undefined
   1.242 +	    print JDX "0xFFFD,";
   1.243 +	} else {
   1.244 +	    print JDX "$ucs,";
   1.245 +	}
   1.246 +	if (7 == ( ($j+1) % 8 )) {  # end of an output line: append a position comment and a newline
   1.247 +	    printf JDX "/* 0x%2X%1X%1X*/\n", $i+0x21, 2+($j/16), (6==($j%16))?0:8;
   1.248 +	}
   1.249 +    }
   1.250 +    printf JDX "       /* 0x%2X%1X%1X*/\n", $i+0x21, 2+($j/16), (6==($j%16))?0:8;  # closes the last, shorter line of the row
   1.251 +}
   1.252 +
   1.253 +close (JDX);
   1.254 +
   1.255 +
   1.256 +close(SU);  # presumably closed here so the data is on disk before genuf() runs cat on these files - confirm
   1.257 +close(US);
   1.258 +close(JU);
   1.259 +close(JEU);
   1.260 +close(JAU);
   1.261 +close(JRKU);
   1.262 +close(UJ);
   1.263 +close(UJE);
   1.264 +close(UJA);
   1.265 +close(UJRK);
   1.266 +
   1.267 +# generate uf files
   1.268 +# genuf(INFILE, OUTFILE): run "cat INFILE | ./umaptable -uf" and redirect the output to $gendir/OUTFILE.
   1.269 +sub genuf {
   1.270 +    my $infile = shift; my $outfile = shift;
   1.271 +    my $pipeline = "cat $infile | ./umaptable -uf > $gendir/$outfile";  # INFILE may name several files
   1.272 +    print "Executing $pipeline\n";
   1.273 +    return system($pipeline);  # the status from system() is the sub's value
   1.274 +}
   1.275 +
   1.276 +genuf($sufile, "sjis.uf");
   1.277 +genuf($jufile, "jis0208.uf");
   1.278 +if ( @ujelist > 0 ) {  # at least one extension entry; $#ujelist > 0 skipped the single-entry case
   1.279 +    genuf($jeufile, "jis0208ext.uf");
   1.280 +} else {
   1.281 +    print "Extension is not found. jis0208ext.uf is not generated.\n";
   1.282 +}
   1.283 +genuf("$jaufile $jrkufile", "jis0201.uf");  # both files are passed to one cat, so they are concatenated
   1.284 +# genuf($jaufile, "jis0201.uf");
   1.285 +# genuf($jrkufile, "jis0201gl.uf");
   1.286 +
   1.287 +
   1.288 +# generate test page
   1.289 +
   1.290 +
   1.291 +exit;
   1.292 +# sjistojis(SJIS): turn a "0xHHLL" Shift_JIS string into a "0xHHLL" JIS string; returns "" when it cannot.
   1.293 +sub sjistojis {
   1.294 +   my($sjis) = (@_);
   1.295 +   my($first,$second,$jnum);  # lead byte, trail byte, linear cell index (all lexical)
   1.296 +
   1.297 +   if ( $sjis !~ /^0x....$/ ) {  # only "0x" plus exactly four characters is converted
   1.298 +       return "";
   1.299 +   }
   1.300 +
   1.301 +   $first = hex(substr($sjis,2,2));
   1.302 +   $second = hex(substr($sjis,4,2));
   1.303 +   $jnum=0;
   1.304 +
   1.305 +   if($first < 0xE0)  # each lead byte contributes (0xfd-0x80)+(0x7f-0x40) = 188 cells
   1.306 +   {
   1.307 +       $jnum = ($first - 0x81) * ((0xfd - 0x80)+(0x7f - 0x40));
   1.308 +   } else {
   1.309 +       $jnum = ($first - 0xe0 + (0xa0-0x81)) * ((0xfd - 0x80)+(0x7f - 0x40));  # 0xE0.. continues after the 0x81.. block
   1.310 +   }
   1.311 +   if($second >= 0x80)  # trail bytes from 0x80 up follow the 0x7f-0x40 = 63 cells that start at 0x40
   1.312 +   {
   1.313 +       $jnum += $second - 0x80 + (0x7f-0x40);
   1.314 +   }
   1.315 +   else
   1.316 +   {
   1.317 +       $jnum += $second - 0x40;
   1.318 +   }
   1.319 +   if(($jnum / 94 ) < 94) {  # row index must fit in 94 rows of 94 cells
   1.320 +       return sprintf "0x%02X%02X", (($jnum / 94) + 0x21), (($jnum % 94)+0x21);
   1.321 +   } else {
   1.322 +       #return sprintf "# 0x%02X%02X", (($jnum / 94) + 0x21), (($jnum % 94)+0x21);
   1.323 +       return "";
   1.324 +   }
   1.325 +}
   1.326 +

mercurial