intl/lwbrk/tools/anzx4051.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/lwbrk/tools/anzx4051.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,356 @@
     1.4 +#!/usr/bin/perl 
     1.5 +#
     1.6 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.7 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.9 +
    ######################################################################
    #
    # Initial global variables
    #
    ######################################################################
    %utot = ();
    $ui = 0;
    $li = 0;

    ######################################################################
    #
    # Open the input and output files.
    #
    # All paths are relative to the current working directory.  The
    # three-argument form of open keeps the mode separate from the file
    # name, and every failure message carries $! so the real reason
    # (missing file, permissions, ...) is reported.
    #
    ######################################################################

    # Input: the Unicode database file
    open(UNICODATA, '<', '../../unicharutil/tools/UnicodeData-Latest.txt')
        or die "cannot find UnicodeData-Latest.txt: $!";

    # Input: the JIS X 4051 class file
    open(CLASS, '<', 'jisx4051class.txt')
        or die "cannot find jisx4051class.txt: $!";

    # Input: the JIS X 4051 class simplified mapping
    open(SIMP, '<', 'jisx4051simp.txt')
        or die "cannot find jisx4051simp.txt: $!";

    # Output: the HTML analysis report
    open(OUT, '>', 'anzx4051.html')
        or die "cannot open output anzx4051.html file: $!";

    # Output: the generated C header
    open(HEADER, '>', '../src/jisx4051class.h')
        or die "cannot open output ../src/jisx4051class.h file: $!";
    1.58 +
    ######################################################################
    #
    # Write the license comment and the opening markup of the HTML report.
    # The text is assembled from a list of lines; the trailing empty
    # element supplies the final newline.
    #
    ######################################################################
    $hthmlheader = join("\n",
        '<!-- This Source Code Form is subject to the terms of the Mozilla Public',
        '   - License, v. 2.0. If a copy of the MPL was not distributed with this',
        '   - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->',
        '',
        '<HTML>',
        '<HEAD>',
        '<TITLE>',
        'Analysis of JIS X 4051 to Unicode General Category Mapping',
        '</TITLE>',
        '</HEAD>',
        '<BODY>',
        '<H1>',
        'Analysis of JIS X 4051 to Unicode General Category Mapping',
        '</H1>',
        '',
    );
    print OUT $hthmlheader;

    ######################################################################
    #
    # Write the mode line, license comment and "generated file" notice
    # at the top of the C header.
    #
    ######################################################################
    $npl = join("\n",
        '/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */',
        '/* This Source Code Form is subject to the terms of the Mozilla Public',
        ' * License, v. 2.0. If a copy of the MPL was not distributed with this',
        ' * file, You can obtain one at http://mozilla.org/MPL/2.0/. */',
        '/* ',
        '    DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY',
        '    mozilla/intl/lwbrk/tools/anzx4051.pl',
        ' */',
        '',
    );
    print HEADER $npl;
    1.98 +
    # Lookup tables and tallies shared by the rest of the script.
    %occ = ();          # numeric code point -> "simplified class | class"
    %gcat = ();         # hex code point (as written in the database) -> category
    %dcat = ();         # hex code point -> first letter of the category
    %simp = ();         # class -> simplified class
    %gcount = ();       # category -> number of database lines
    %dcount = ();       # first letter of category -> number of database lines
    %sccount = ();      # simplified class -> number of mapping lines
    %rangecount = ();   # " HH:simplified class" -> number of code points

    ######################################################################
    #
    # Process the Unicode database line by line
    #
    ######################################################################
    while (<UNICODATA>) {
        # chomp only removes a line terminator, so a final line without a
        # newline keeps its last character (chop would eat it).
        chomp;
        s/\r\z//;

        # Fields are separated by ';'.  Only the first field (the code
        # point in hex) and the third field (the category) are used.
        my @fields = split(/;/, $_);
        my $code = $fields[0];
        my $category = $fields[2];

        # Blank or truncated lines carry no category; skipping them keeps
        # empty keys out of the tables and out of the report columns.
        next unless defined $category && $category ne "";

        my $major = substr($category, 0, 1);

        $gcat{$code} = $category;
        $dcat{$code} = $major;
        $gcount{$category}++;
        $dcount{$major}++;
    }
    # Close the handle that was actually read from.
    close(UNICODATA);
   1.131 +
   # Load the simplified-class mapping.  Each line has two ';'-separated
   # fields: a class name (the key later looked up with the third field of
   # the class file) and the simplified class it maps to.
   while (<SIMP>) {
       # chomp only removes a line terminator, so a final line without a
       # newline keeps its last character (chop would eat it).
       chomp;
       s/\r\z//;

       my ($class, $simplified) = split(/;/, $_);

       # Blank or incomplete lines would otherwise add an empty class,
       # which shows up as an empty row in the report.
       next unless defined $simplified;

       $simp{$class} = $simplified;
       $sccount{$simplified}++;
   }
   close(SIMP);
   1.145 +
   # GetClass($u)
   #
   # Return the category recorded in %gcat for the numeric code point $u.
   # When %gcat has no entry, fall back to a fixed label for a few
   # hard-coded code point ranges.  If nothing matches, print a warning on
   # STDOUT and return the empty string (previously the result of printf
   # leaked out as the return value).
   sub GetClass {
       my ($u) = @_;
       my $hex = DecToHex($u);

       # Lexical on purpose: the file-level $g must not be clobbered.
       my $category = $gcat{$hex};
       return $category if defined $category && $category ne "";

       # [ first, last, label ] -- checked in order, bounds inclusive.
       my @fallbacks = (
           [ 0x3400, 0x9fa5, "Han" ],
           [ 0xac00, 0xd7a3, "Lo"  ],
           [ 0xd800, 0xdfff, "Cs"  ],
           [ 0xe000, 0xf8ff, "Co"  ],
       );
       foreach my $range (@fallbacks) {
           my ($low, $high, $label) = @$range;
           return $label if $low <= $u && $u <= $high;
       }

       printf "WARNING !!!! Cannot find General Category for U+%s \n" , $hex;
       return "";
   }
   # GetDClass($u)
   #
   # Return the one-letter category recorded in %dcat for the numeric code
   # point $u.  When %dcat has no entry, fall back to a fixed label for a
   # few hard-coded code point ranges.  If nothing matches, print a warning
   # on STDOUT and return the empty string (previously the result of printf
   # leaked out as the return value).
   sub GetDClass {
       my ($u) = @_;
       my $hex = DecToHex($u);

       # Lexical on purpose: the file-level $g must not be clobbered.
       my $category = $dcat{$hex};
       return $category if defined $category && $category ne "";

       # [ first, last, label ] -- checked in order, bounds inclusive.
       my @fallbacks = (
           [ 0x3400, 0x9fa5, "Han" ],
           [ 0xac00, 0xd7a3, "L"   ],
           [ 0xd800, 0xdfff, "C"   ],
           [ 0xe000, 0xf8ff, "C"   ],
       );
       foreach my $range (@fallbacks) {
           my ($low, $high, $label) = @$range;
           return $label if $low <= $u && $u <= $high;
       }

       printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex;
       return "";
   }
   # DecToHex($number)
   #
   # Format a number as upper-case hexadecimal, zero-padded to at least
   # four digits (wider values are not truncated).
   sub DecToHex {
       my $number = shift;
       my $text = sprintf("%04X", $number);
       return $text;
   }
   %gtotal = ();   # simplified class . category        -> code point count
   %dtotal = ();   # simplified class . category letter -> code point count

   # Read the class file.  Each line is "first;last;class" with the code
   # points in hex; an empty "last" means a single code point.
   while (<CLASS>) {
       # chomp only removes a line terminator, so a final line without a
       # newline keeps its last character (chop would eat it).
       chomp;
       s/\r\z//;

       my ($first_hex, $last_hex, $class) = split(/;/, $_);

       # A blank or short line has no class.  Processing it would record
       # code point 0 with an empty class, so skip it.
       next unless defined $class && $class ne "";

       # Classes whose name starts with "a" are left out of the analysis.
       next if substr($class, 0, 1) eq "a";

       my $sc = $simp{$class};
       my $first = hex($first_hex);
       my $last = ($last_hex eq "") ? $first : hex($last_hex);

       foreach my $cp ($first .. $last) {
           # The first definition of a code point wins; later, conflicting
           # definitions are silently ignored.
           next if exists($occ{$cp});

           $occ{$cp} = $sc . " | " . $class;
           $gtotal{$sc . GetClass($cp)}++;
           $dtotal{$sc . GetDClass($cp)}++;

           # Bucket by the first two hex digits of the code point.
           my $bucket = " " . substr(DecToHex($cp), 0, 2) . ":" . $sc;
           $rangecount{$bucket}++;
       }
   }
   1.236 +
   1.237 +#print %gtotal;
   1.238 +#print %dtotal;
   1.239 +
   # printreport()
   #
   # Write two HTML tables to OUT:
   #   1. one row per simplified class, with the count per category letter
   #      (%dtotal), the row total, and the count per category (%gtotal);
   #   2. one row per two-hex-digit code point prefix, with the count per
   #      simplified class (%rangecount).  Rows whose counts sum to zero
   #      are left out.
   # Missing counts are printed as empty cells.  All scratch values are
   # lexical so the call does not disturb any file-level variable.
   sub printreport
   {
       # Sort once; the same column order is used for every row.
       my @letters    = sort(keys %dcount);
       my @categories = sort(keys %gcount);
       my @classes    = sort(keys %sccount);

       # ---- Table 1: simplified class versus category -------------------
       print OUT "<TABLE BORDER=3>\n";
       print OUT "<TR BGCOLOR=blue><TH><TH>\n";
       foreach my $letter (@letters) {
           print OUT "<TD BGCOLOR=red>$letter</TD>\n";
       }
       print OUT "<TD BGCOLOR=white>Total</TD>\n";
       foreach my $category (@categories) {
           print OUT "<TD BGCOLOR=yellow>$category</TD>\n";
       }
       print OUT "</TR>\n";

       foreach my $class (@classes) {
           print OUT "<TR><TH>$class<TH>\n";

           my $total = 0;
           foreach my $letter (@letters) {
               my $count = $dtotal{$class . $letter};
               $count = "" unless defined $count;
               $total += $count if $count ne "";
               print OUT "<TD>$count</TD>\n";
           }

           print OUT "<TD BGCOLOR=white>$total</TD>\n";

           foreach my $category (@categories) {
               my $count = $gtotal{$class . $category};
               $count = "" unless defined $count;
               print OUT "<TD>$count</TD>\n";
           }

           print OUT "</TR>\n";
       }
       print OUT "</TABLE>\n";

       # ---- Table 2: code point prefix versus simplified class ----------
       print OUT "<TABLE BORDER=3>\n";
       print OUT "<TR BGCOLOR=blue><TH><TH>\n";
       foreach my $class (@classes) {
           print OUT "<TD BGCOLOR=red>$class</TD>\n";
       }
       print OUT "</TR>\n";

       # NOTE(review): only prefixes 00 .. 4E are considered; entries of
       # %rangecount with a higher prefix are never printed -- confirm
       # that this limit is intended.
       for (my $row = 0; $row < 0x4f; $row++) {
           my $prefix = sprintf("%02X", $row);
           my $cells = "";
           my $sum = 0;

           foreach my $class (@classes) {
               my $count = $rangecount{" " . $prefix . ":" . $class};
               $count = "" unless defined $count;
               $cells .= sprintf("<TD>%s</TD>\n", $count);
               $sum += $count if $count ne "";
           }

           # Numeric test: skip prefixes with nothing in them.
           next if $sum == 0;

           print OUT "<TR><TH>" . $prefix . "<TH>\n" . $cells . "</TR>\n";
       }
       print OUT "</TABLE>\n";
   }
   printreport();
   1.312 +
   # printarray($r, $def)
   #
   # Write to HEADER a C array "gLBClass<$r>" of 32 words that covers the
   # 256 code points whose upper byte is the hex string $r.  Each word is
   # printed as "0x" followed by eight characters, one per code point,
   # with the highest code point of the group first.  The character for a
   # code point is the second character of its %occ entry, or $def when
   # the code point has no entry.
   #
   # NOTE(review): the output is only valid C if those characters are hex
   # digits, i.e. if simplified classes look like "<any char><hex digit>"
   # -- confirm against jisx4051simp.txt.
   #
   # All scratch values are lexical so the call does not disturb any
   # file-level variable.
   sub printarray
   {
       my ($r, $def) = @_;

       # Progress trace on STDOUT.
       printf "[%s || %s]\n", $r, $def;

       my $base = hex($r) * 256;
       printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r;

       for (my $offset = 0; $offset < 256; $offset += 8) {
           my $word = "0x";
           for (my $slot = 7; $slot >= 0; $slot--) {
               my $cp = $base + $offset + $slot;
               $word .= exists($occ{$cp}) ? substr($occ{$cp}, 1, 1) : $def;
           }
           printf HEADER "%s, // U+%04X - U+%04X\n",
               $word, $base + $offset, $base + $offset + 7;
       }

       print HEADER "};\n\n";
   }
   printarray("00", "7");
   printarray("20", "7");
   printarray("21", "7");
   printarray("30", "5");
   printarray("0E", "8");
   printarray("17", "7");
   1.349 +#print %rangecount;
   1.350 +
   ######################################################################
   #
   # Close files
   #
   # Output is buffered, so a write error (for example a full disk) may
   # only become visible when the handle is closed.  Check the result for
   # the two files this script writes.
   #
   ######################################################################
   close(HEADER) or die "cannot finish writing ../src/jisx4051class.h: $!";
   close(CLASS);
   close(OUT) or die "cannot finish writing anzx4051.html: $!";
   1.359 +

mercurial