|
1 #!/usr/bin/perl |
|
2 # |
|
3 # This Source Code Form is subject to the terms of the Mozilla Public |
|
4 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
6 |
|
######################################################################
#
# Initial global variables
#
# NOTE(review): %utot, $ui and $li are initialised here but are not
# referenced anywhere else in the visible part of this script.
#
######################################################################
%utot = ();
$ui = 0;
$li = 0;

######################################################################
#
# Open the Unicode database file (input)
#
######################################################################
open(UNICODATA, "<", "../../unicharutil/tools/UnicodeData-Latest.txt")
    || die "cannot find UnicodeData-Latest.txt: $!";

######################################################################
#
# Open the JIS X 4051 class file (input)
#
######################################################################
open(CLASS, "<", "jisx4051class.txt")
    || die "cannot find jisx4051class.txt: $!";

######################################################################
#
# Open the JIS X 4051 class simplified mapping (input)
#
######################################################################
open(SIMP, "<", "jisx4051simp.txt")
    || die "cannot find jisx4051simp.txt: $!";

######################################################################
#
# Open the HTML analysis report (output)
#
######################################################################
open(OUT, ">", "anzx4051.html")
    || die "cannot open output anzx4051.html file: $!";

######################################################################
#
# Open the generated C header (output)
#
######################################################################
open(HEADER, ">", "../src/jisx4051class.h")
    || die "cannot open output ../src/jisx4051class.h file: $!";
|
55 |
|
######################################################################
#
# Write the license comment and the opening markup of the HTML report
#
# NOTE(review): leading whitespace inside the here-document could not
# be recovered from the numbered listing this was taken from; confirm
# against the pristine file if exact layout matters.
#
######################################################################
my $report_title = "Analysis of JIS X 4051 to Unicode General Category Mapping";

# The title appears twice in the page: once in <TITLE>, once in <H1>.
$hthmlheader = <<"END_OF_HTML";
<!-- This Source Code Form is subject to the terms of the Mozilla Public
- License, v. 2.0. If a copy of the MPL was not distributed with this
- file, You can obtain one at http://mozilla.org/MPL/2.0/. -->

<HTML>
<HEAD>
<TITLE>
$report_title
</TITLE>
</HEAD>
<BODY>
<H1>
$report_title
</H1>
END_OF_HTML

print OUT $hthmlheader;
|
78 |
|
######################################################################
#
# Write the license and the "generated file" notice at the top of the
# C header
#
# NOTE(review): leading whitespace inside these lines could not be
# recovered from the numbered listing this was taken from; confirm
# against the pristine file if exact layout matters.
#
######################################################################
my @notice_lines = (
    '/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */',
    '/* This Source Code Form is subject to the terms of the Mozilla Public',
    '* License, v. 2.0. If a copy of the MPL was not distributed with this',
    '* file, You can obtain one at http://mozilla.org/MPL/2.0/. */',
    '/*',
    'DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY',
    'mozilla/intl/lwbrk/tools/anzx4051.pl',
    '*/',
);

# Every line, including the last one, is newline-terminated.
$npl = join("\n", @notice_lines) . "\n";

print HEADER $npl;
|
95 |
|
######################################################################
#
# Tables shared by the rest of the script
#
######################################################################
%occ = ();         # numeric code point => "simplified class | original class"
%gcat = ();        # code point as written in UnicodeData => general category
%dcat = ();        # code point as written in UnicodeData => first letter of category
%simp = ();        # JIS X 4051 class => simplified class
%gcount = ();      # general category => number of UnicodeData entries
%dcount = ();      # first letter of category => number of UnicodeData entries
%sccount = ();     # simplified class => number of mapping entries
%rangecount = ();  # " HH:class" (HH = high byte in hex) => characters counted

######################################################################
#
# Process the Unicode database line by line
#
######################################################################
while (my $line = <UNICODATA>) {
    # chomp (not chop) so that a final line without a newline keeps
    # its last character.
    chomp $line;

    my @field = split(/;/, $line);

    # Blank or truncated lines carry no category; skipping them keeps
    # a bogus empty category out of the tables and out of the report.
    next if @field < 3;

    my $code     = $field[0];                # the Unicode value
    my $category = $field[2];                # general category, e.g. "Lu"
    my $major    = substr($category, 0, 1);  # first letter only, e.g. "L"

    $gcat{$code} = $category;
    $dcat{$code} = $major;
    $gcount{$category}++;
    $dcount{$major}++;
}
# The handle opened for the database is UNICODATA; the original closed
# the never-opened name UNIDATA and so left this one open.
close(UNICODATA);
|
128 |
|
######################################################################
#
# Read the simplified mapping: each line is "class;simplified class"
#
######################################################################
while (my $line = <SIMP>) {
    # chomp (not chop): the simplified class is the last field used, so
    # chop would truncate it on a final line that lacks a newline.
    chomp $line;

    my @field = split(/;/, $line);

    # Ignore blank or incomplete lines instead of tallying an empty
    # simplified class.
    next if @field < 2;

    my ($class, $simplified) = @field;
    $simp{$class} = $simplified;
    $sccount{$simplified}++;
}
close(SIMP);
|
142 |
|
######################################################################
#
# GetClass($u)
#
# $u is a numeric code point.  Returns the general category stored in
# %gcat under the hexadecimal form of $u.  When %gcat has no entry,
# returns a fixed category for the hard-coded ranges listed below.
# For anything else a warning is printed and the empty string is
# returned, so callers can safely concatenate the result.
#
######################################################################
sub GetClass {
    my ($u) = @_;
    my $hex = DecToHex($u);

    # Lexical copy: the original wrote through the global $g.
    my $category = $gcat{$hex};
    return $category if defined($category) && $category ne "";

    # [ first, last, category ] for ranges handled without a table entry.
    my @fallback = (
        [ 0x3400, 0x9fa5, "Han" ],
        [ 0xac00, 0xd7a3, "Lo"  ],
        [ 0xd800, 0xdb7f, "Cs"  ],
        [ 0xdb80, 0xdbff, "Cs"  ],
        [ 0xdc00, 0xdfff, "Cs"  ],
        [ 0xe000, 0xf8ff, "Co"  ],
    );
    foreach my $range (@fallback) {
        my ($first, $last, $name) = @$range;
        return $name if $first <= $u && $u <= $last;
    }

    printf "WARNING !!!! Cannot find General Category for U+%s \n", $hex;

    # Explicit result: previously the sub fell off the end and handed
    # back printf's return value.
    return "";
}
|
######################################################################
#
# GetDClass($u)
#
# $u is a numeric code point.  Returns the entry stored in %dcat (the
# first letter of the general category) under the hexadecimal form of
# $u.  When %dcat has no entry, returns a fixed value for the
# hard-coded ranges listed below.  For anything else a warning is
# printed and the empty string is returned, so callers can safely
# concatenate the result.
#
######################################################################
sub GetDClass {
    my ($u) = @_;
    my $hex = DecToHex($u);

    # Lexical copy: the original wrote through the global $g.
    my $major = $dcat{$hex};
    return $major if defined($major) && $major ne "";

    # [ first, last, value ] for ranges handled without a table entry.
    my @fallback = (
        [ 0x3400, 0x9fa5, "Han" ],
        [ 0xac00, 0xd7a3, "L"   ],
        [ 0xd800, 0xdb7f, "C"   ],
        [ 0xdb80, 0xdbff, "C"   ],
        [ 0xdc00, 0xdfff, "C"   ],
        [ 0xe000, 0xf8ff, "C"   ],
    );
    foreach my $range (@fallback) {
        my ($first, $last, $name) = @$range;
        return $name if $first <= $u && $u <= $last;
    }

    printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n", $hex;

    # Explicit result: previously the sub fell off the end and handed
    # back printf's return value.
    return "";
}
|
# DecToHex($d): format the number $d as upper-case hexadecimal,
# zero-padded on the left to at least four digits.
sub DecToHex {
    my $d = shift;
    my $hex = sprintf("%04X", $d);
    return $hex;
}
|
%gtotal = ();   # simplified class . general category     => characters
%dtotal = ();   # simplified class . first category letter => characters

######################################################################
#
# Read the JIS X 4051 class file.  The fields used are
# "first;last;class": first and last are hexadecimal code points, and
# an empty last means a single character.
#
######################################################################
while (my $line = <CLASS>) {
    # chomp (not chop) so a final line without a newline keeps the
    # last character of its class field.
    chomp $line;

    my @field = split(/;/, $line);

    # A blank or short line has no class field.  Without this guard it
    # would fall through and assign a class to U+0000, because hex() of
    # a missing field is 0.
    next if @field < 3;

    my ($first_hex, $last_hex, $class) = @field;

    # Classes whose name starts with "a" are not tallied.
    next if substr($class, 0, 1) eq "a";

    my $sc = $simp{$class};
    $sc = "" unless defined $sc;   # unmapped class: same keys as before

    my $low  = hex($first_hex);
    my $high = ($last_hex eq "") ? $low : hex($last_hex);

    foreach my $cp ($low .. $high) {
        # The first definition of a code point wins; later ones are
        # ignored silently.  To report them, enable:
        # printf "WARNING !! Conflict definition!!! U+%s -> [%s] [%s | %s]\n",
        #     DecToHex($cp), $occ{$cp}, $class, $sc;
        next if exists($occ{$cp});

        $occ{$cp} = $sc . " | " . $class;
        $gtotal{$sc . GetClass($cp)}++;
        $dtotal{$sc . GetDClass($cp)}++;

        # Per-row tally, keyed by the first two hex digits of the code point.
        my $row_key = " " . substr(DecToHex($cp), 0, 2) . ":" . $sc;
        $rangecount{$row_key}++;
    }
}
|
233 |
|
234 #print %gtotal; |
|
235 #print %dtotal; |
|
236 |
|
# printreport(): write two HTML tables to OUT.
#
# Table 1 has one row per simplified class (keys of %sccount).  Its
# columns are the counts from %dtotal for every key of %dcount, the sum
# of those counts, then the counts from %gtotal for every key of
# %gcount.
#
# Table 2 has one row per two-digit hex prefix from 00 to 4E and one
# column per simplified class, filled from %rangecount.  Rows whose
# counts sum to zero are left out.
sub printreport
{
    my @major_cats     = sort(keys %dcount);
    my @general_cats   = sort(keys %gcount);
    my @simple_classes = sort(keys %sccount);

    # ---- table 1: simplified class versus category ----
    print OUT "<TABLE BORDER=3>\n";
    print OUT "<TR BGCOLOR=blue><TH><TH>\n";
    print OUT "<TD BGCOLOR=red>$_</TD>\n" foreach @major_cats;
    print OUT "<TD BGCOLOR=white>Total</TD>\n";
    print OUT "<TD BGCOLOR=yellow>$_</TD>\n" foreach @general_cats;
    print OUT "</TR>\n";

    foreach my $class (@simple_classes) {
        print OUT "<TR><TH>$class<TH>\n";

        my $row_sum = 0;
        foreach my $major (@major_cats) {
            my $n = $dtotal{$class . $major};
            $row_sum += $n;
            print OUT "<TD>$n</TD>\n";
        }

        print OUT "<TD BGCOLOR=white>$row_sum</TD>\n";

        foreach my $general (@general_cats) {
            my $n = $gtotal{$class . $general};
            print OUT "<TD>$n</TD>\n";
        }

        print OUT "</TR>\n";
    }
    print OUT "</TABLE>\n";

    # ---- table 2: hex prefix versus simplified class ----
    print OUT "<TABLE BORDER=3>\n";
    print OUT "<TR BGCOLOR=blue><TH><TH>\n";
    print OUT "<TD BGCOLOR=red>$_</TD>\n" foreach @simple_classes;
    print OUT "</TR>\n";

    foreach my $prefix (0x00 .. 0x4e) {
        my $label   = sprintf("%02X", $prefix);
        my $row     = "<TR><TH>" . $label . "<TH>\n";
        my $row_sum = 0;

        foreach my $class (@simple_classes) {
            my $n = $rangecount{" " . $label . ":" . $class};
            $row .= sprintf("<TD>%s</TD>\n", $n);
            $row_sum += $n;
        }
        $row .= "</TR>\n";

        # Only rows with at least one counted character are printed.
        next if $row_sum == 0;
        print OUT $row;
    }
    print OUT "</TABLE>\n";
}
printreport();
|
309 |
|
# printarray($r, $def): write one 32-entry uint32_t table to HEADER.
#
# $r   - two hex digits; the table is named gLBClass$r and covers the
#        256 code points starting at hex($r) * 256.
# $def - the digit used for code points that have no entry in %occ.
#
# Each array element packs eight code points, one digit per code point,
# with the highest code point written first.  The digit is the second
# character of the %occ entry.  A progress line "[$r || $def]" is also
# printed to the default output handle.
sub printarray
{
    my ($r, $def) = @_;
    printf "[%s || %s]\n", $r, $def;

    my $base = hex($r) * 256;
    printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r;

    foreach my $word (0 .. 31) {
        my $first = $base + $word * 8;
        my $last  = $first + 7;

        my $digits = "";
        foreach my $cp (reverse($first .. $last)) {
            $digits .= exists($occ{$cp}) ? substr($occ{$cp}, 1, 1) : $def;
        }

        printf HEADER "0x%s, // U+%04X - U+%04X\n", $digits, $first, $last;
    }
    print HEADER "};\n\n";
}

# Tables to generate: [ high byte in hex, default digit ].
foreach my $table ([ "00", "7" ], [ "20", "7" ], [ "21", "7" ],
                   [ "30", "5" ], [ "0E", "8" ], [ "17", "7" ]) {
    printarray(@$table);
}
|
345 |
|
346 #print %rangecount; |
|
347 |
|
######################################################################
#
# Close files
#
# Buffered write errors (for example a full disk) are only reported
# when the handle is closed, so the two output files are checked.
#
######################################################################
close(HEADER) || die "cannot finish writing ../src/jisx4051class.h: $!";
close(CLASS);
close(OUT) || die "cannot finish writing anzx4051.html: $!";
|
356 |