#!/usr/bin/perl
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

######################################################################
#
# anzx4051.pl
#
# Reads the Unicode character database, the JIS X 4051 class table and
# the simplified-class mapping, then writes:
#
#   anzx4051.html          - two HTML tables counting characters per
#                            simplified class, broken down by Unicode
#                            category and by high byte of the code point
#   ../src/jisx4051class.h - C arrays (gLBClassXX) packing eight
#                            one-character class codes into each
#                            uint32_t literal
#
# All paths are relative to the current working directory.
#
######################################################################

use strict;
use warnings;

######################################################################
#
# File locations
#
######################################################################
my $UNICODE_DATA_FILE = '../../unicharutil/tools/UnicodeData-Latest.txt';
my $CLASS_FILE        = 'jisx4051class.txt';
my $SIMP_FILE         = 'jisx4051simp.txt';
my $REPORT_FILE       = 'anzx4051.html';
my $HEADER_FILE       = '../src/jisx4051class.h';

# The per-row report table lists rows 00 .. (limit - 1) only.
# NOTE(review): counts for rows 4F and above are collected in
# %rangecount but never printed -- confirm that this cut-off is intended.
my $REPORT_ROW_LIMIT = 0x4f;

# Categories for code points that have no line of their own in the
# Unicode database.  Columns: first, last, general category, major
# category.  The surrogate entry merges three adjacent ranges
# (D800-DB7F, DB80-DBFF, DC00-DFFF) that all map to the same values.
my @FALLBACK_RANGES = (
    [ 0x3400, 0x9fa5, 'Han', 'Han' ],
    [ 0xac00, 0xd7a3, 'Lo',  'L'   ],
    [ 0xd800, 0xdfff, 'Cs',  'C'   ],
    [ 0xe000, 0xf8ff, 'Co',  'C'   ],
);

######################################################################
#
# Global tables
#
######################################################################
my %occ        = ();    # code point => "<simplified class> | <JIS class>"
my %gcat       = ();    # hex code point => general category (field 2)
my %dcat       = ();    # hex code point => first letter of that category
my %simp       = ();    # JIS X 4051 class => simplified class
my %gcount     = ();    # general category => number of database lines
my %dcount     = ();    # category first letter => number of database lines
my %sccount    = ();    # simplified class => number of mapping lines
my %rangecount = ();    # " <row hex>:<simplified class>" => characters
my %gtotal     = ();    # <simplified class><general category> => characters
my %dtotal     = ();    # <simplified class><category letter> => characters

######################################################################
#
# Open the input files, then the output files (in that order, so that
# a missing input is reported before an output is truncated)
#
######################################################################
open(my $unidata_fh, '<', $UNICODE_DATA_FILE)
    or die "cannot find UnicodeData-Latest.txt: $!";

open(my $class_fh, '<', $CLASS_FILE)
    or die "cannot find jisx4051class.txt: $!";

open(my $simp_fh, '<', $SIMP_FILE)
    or die "cannot find jisx4051simp.txt: $!";

open(my $out_fh, '>', $REPORT_FILE)
    or die "cannot open output anzx4051.html file: $!";

open(my $header_fh, '>', $HEADER_FILE)
    or die "cannot open output ../src/jisx4051class.h file: $!";

######################################################################
#
# Subroutines
#
######################################################################

# Format a code point as upper-case hex, zero padded to at least four
# digits (the key format used by %gcat and %dcat).
sub DecToHex {
    my ($d) = @_;
    return sprintf("%04X", $d);
}

# Shared lookup behind GetClass and GetDClass.
#   $u      - numeric code point
#   $table  - reference to %gcat or %dcat
#   $column - index into a @FALLBACK_RANGES entry (2 or 3)
#   $label  - wording used in the warning
# Returns the category, or '' after printing a warning when neither the
# database nor the fallback ranges know the code point.
sub _lookup_category {
    my ($u, $table, $column, $label) = @_;

    my $hex   = DecToHex($u);
    my $found = $table->{$hex};
    return $found if defined $found && $found ne '';

    for my $range (@FALLBACK_RANGES) {
        my ($first, $last) = @{$range}[0, 1];
        return $range->[$column] if $first <= $u && $u <= $last;
    }

    printf "WARNING !!!! Cannot find %s for U+%s \n", $label, $hex;

    # Return an explicit empty category; without this the sub would
    # return printf's result and the caller would tally under "1".
    return '';
}

# General category (e.g. "Lo") of numeric code point $u.
sub GetClass {
    my ($u) = @_;
    return _lookup_category($u, \%gcat, 2, 'General Category');
}

# First letter of the general category (e.g. "L") of code point $u.
sub GetDClass {
    my ($u) = @_;
    return _lookup_category($u, \%dcat, 3, 'Detailed General Category');
}

# Write both HTML tables to the report file.
sub printreport {
    my @major_cats     = sort keys %dcount;
    my @general_cats   = sort keys %gcount;
    my @simple_classes = sort keys %sccount;

    # Table 1: one row per simplified class, one column per category.
    print {$out_fh} "<TABLE BORDER=3>\n";
    print {$out_fh} "<TR BGCOLOR=blue><TH><TH>\n";

    for my $d (@major_cats) {
        print {$out_fh} "<TD BGCOLOR=red>$d</TD>\n";
    }

    print {$out_fh} "<TD BGCOLOR=white>Total</TD>\n";
    for my $g (@general_cats) {
        print {$out_fh} "<TD BGCOLOR=yellow>$g</TD>\n";
    }
    print {$out_fh} "</TR>\n";

    for my $sc (@simple_classes) {
        print {$out_fh} "<TR><TH>$sc<TH>\n";

        my $total = 0;
        for my $d (@major_cats) {
            my $count = $dtotal{ $sc . $d };

            # A combination that never occurred is an empty cell.
            $total += defined $count ? $count : 0;
            print {$out_fh} "<TD>", (defined $count ? $count : ''), "</TD>\n";
        }

        print {$out_fh} "<TD BGCOLOR=white>$total</TD>\n";

        for my $g (@general_cats) {
            my $count = $gtotal{ $sc . $g };
            print {$out_fh} "<TD>", (defined $count ? $count : ''), "</TD>\n";
        }

        print {$out_fh} "</TR>\n";
    }
    print {$out_fh} "</TABLE>\n";

    # Table 2: one row per high byte, one column per simplified class.
    print {$out_fh} "<TABLE BORDER=3>\n";
    print {$out_fh} "<TR BGCOLOR=blue><TH><TH>\n";

    for my $sc (@simple_classes) {
        print {$out_fh} "<TD BGCOLOR=red>$sc</TD>\n";
    }

    print {$out_fh} "</TR>\n";

    for my $row (0 .. $REPORT_ROW_LIMIT - 1) {
        my $row_hex   = sprintf("%02X", $row);
        my $row_html  = "<TR><TH>" . $row_hex . "<TH>\n";
        my $row_total = 0;

        for my $sc (@simple_classes) {
            my $count = $rangecount{ " " . $row_hex . ":" . $sc };
            $row_html .= sprintf("<TD>%s</TD>\n", defined $count ? $count : '');
            $row_total += defined $count ? $count : 0;
        }

        $row_html .= "</TR>\n";

        # Rows without any classified character are left out.
        print {$out_fh} $row_html if $row_total != 0;
    }
    print {$out_fh} "</TABLE>\n";

    return;
}

# Write one gLBClass<$r> array to the C header.
#   $r   - two hex digits naming the 256-code-point row (e.g. "0E")
#   $def - class character used for code points absent from %occ
# Each of the 32 literals holds eight class characters, with the
# highest code point of the group written first.
sub printarray {
    my ($r, $def) = @_;

    printf "[%s || %s]\n", $r, $def;    # progress trace on STDOUT

    my $base = hex($r) * 256;
    printf {$header_fh} "static const uint32_t gLBClass%s[32] = {\n", $r;

    for (my $offset = 0; $offset < 256; $offset += 8) {
        my $word = '';
        for my $slot (reverse 0 .. 7) {
            my $cp = $base + $offset + $slot;

            # The class character is the SECOND character of the
            # simplified class string.
            # NOTE(review): presumably the simplified classes are
            # written with a one-character prefix -- confirm against
            # jisx4051simp.txt.
            $word .= exists $occ{$cp} ? substr($occ{$cp}, 1, 1) : $def;
        }
        printf {$header_fh} "0x%s, // U+%04X - U+%04X\n",
            $word, $base + $offset, $base + $offset + 7;
    }
    print {$header_fh} "};\n\n";

    return;
}

######################################################################
#
# Generate license and header of the HTML report
#
######################################################################
my $htmlheader = <<'END_OF_HTML';
<!-- This Source Code Form is subject to the terms of the Mozilla Public
   - License, v. 2.0. If a copy of the MPL was not distributed with this
   - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->

<HTML>
<HEAD>
<TITLE>
Analysis of JIS X 4051 to Unicode General Category Mapping
</TITLE>
</HEAD>
<BODY>
<H1>
Analysis of JIS X 4051 to Unicode General Category Mapping
</H1>
END_OF_HTML
print {$out_fh} $htmlheader;

######################################################################
#
# Generate license and header of the C header file
#
######################################################################
my $npl = <<'END_OF_NPL';
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* 
    DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
    mozilla/intl/lwbrk/tools/anzx4051.pl
 */
END_OF_NPL
print {$header_fh} $npl;

######################################################################
#
# Process the Unicode database line by line
#
######################################################################
while (my $line = <$unidata_fh>) {
    chomp $line;
    $line =~ s/\r\z//;    # tolerate CRLF line endings

    # Field 0 is the code point in hex, field 2 the general category.
    my @f       = split(/;/, $line);
    my $code    = $f[0];
    my $general = $f[2];

    # Blank or truncated lines carry no category; skip them so that
    # they do not add an empty column to the report.
    next if !defined $general || $general eq '';

    my $major = substr($general, 0, 1);

    $gcat{$code} = $general;
    $dcat{$code} = $major;
    $gcount{$general}++;
    $dcount{$major}++;
}
close($unidata_fh);

######################################################################
#
# Process the simplified mapping: <JIS class>;<simplified class>
#
######################################################################
while (my $line = <$simp_fh>) {
    chomp $line;
    $line =~ s/\r\z//;

    my ($jis_class, $simple_class) = split(/;/, $line);
    next if !defined $simple_class;    # blank or incomplete line

    $simp{$jis_class} = $simple_class;
    $sccount{$simple_class}++;
}
close($simp_fh);

######################################################################
#
# Process the class table: <first hex>;<last hex or empty>;<JIS class>
#
######################################################################
while (my $line = <$class_fh>) {
    chomp $line;
    $line =~ s/\r\z//;

    my ($first_hex, $last_hex, $jis_class) = split(/;/, $line);

    # A line without a class field would otherwise be read as
    # "U+0000 with an empty class" and shadow the real U+0000 entry.
    next if !defined $jis_class || $jis_class eq '';

    # Classes whose name starts with "a" are not analysed.
    next if substr($jis_class, 0, 1) eq 'a';

    my $simple_class = $simp{$jis_class};
    if (!defined $simple_class) {
        warn "no simplified class for JIS X 4051 class '$jis_class'"
            . " ($CLASS_FILE line $.); line skipped\n";
        next;
    }

    my $low  = hex($first_hex);
    my $high = (defined $last_hex && $last_hex ne '') ? hex($last_hex) : $low;

    for my $cp ($low .. $high) {

        # The first definition of a code point wins; later conflicting
        # definitions are ignored without comment.
        next if exists $occ{$cp};

        $occ{$cp} = $simple_class . " | " . $jis_class;
        $gtotal{ $simple_class . GetClass($cp) }++;
        $dtotal{ $simple_class . GetDClass($cp) }++;

        # Row key: first two hex digits of the formatted code point.
        $rangecount{ " " . substr(DecToHex($cp), 0, 2) . ":" . $simple_class }++;
    }
}

######################################################################
#
# Write the report and the C arrays
#
######################################################################
printreport();

printarray("00", "7");
printarray("20", "7");
printarray("21", "7");
printarray("30", "5");
printarray("0E", "8");
printarray("17", "7");

######################################################################
#
# Close files (buffered write errors surface at close)
#
######################################################################
close($header_fh) or die "cannot close ../src/jisx4051class.h: $!";
close($class_fh);
close($out_fh)    or die "cannot close anzx4051.html: $!";