intl/chardet/tools/charfreq.pl

branch
TOR_BUG_9701
changeset 8
97036ab72558
equal deleted inserted replaced
-1:000000000000 0:719bf73e02b8
1 #!/usr/bin/perl
2 #!/usr/bin/perl
3 #
4 # This Source Code Form is subject to the terms of the Mozilla Public
5 # License, v. 2.0. If a copy of the MPL was not distributed with this
6 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
use strict;
use warnings;

# Tally how often each two-byte sequence of "high" bytes occurs in the text
# read from STDIN.
#
# Usage: charfreq.pl STATFILE < TEXT
#
# STATFILE supplies previously gathered counts, one "<key> <count>" pair per
# line; they seed the tally.  While scanning, every recognised pair is echoed
# together with its running count, and once STDIN is exhausted the complete
# table is printed sorted by key.

# Byte thresholds used by the scanner.
my $PLAIN_MAX = 0x80;    # bytes up to this value are ignored between pairs
my $PAIR_MIN  = 0xa0;    # only bytes above this value may be part of a pair

@ARGV or die "usage: $0 STATFILE < TEXT\n";
my $stat_file = $ARGV[0];

# Three-argument open so that the file name can never be read as a mode.
open(my $stat_fh, '<', $stat_file)
    or die " cannot open data file $stat_file: $!\n";

# Seed the table: first field is the key, second field is its count.
my %count;
while (my $entry = <$stat_fh>) {
    my @fields = split /\s+/, $entry;
    next unless @fields;                        # blank line: nothing to record
    my ($key, $seen) = @fields;
    $count{$key} = defined $seen ? $seen : 0;   # key without a count starts at 0
}
close $stat_fh;

while (my $line = <STDIN>) {
    # Splitting on /\s*/ yields one element per non-whitespace character, so
    # whitespace (including the line terminator) never reaches the scanner.
    # A line that starts with whitespace produces one empty leading field,
    # which is dropped here.
    my @bytes = grep { length } split /\s*/, $line;

    my $lead;    # first byte of a pair that is waiting for its partner
    for my $byte (@bytes) {    # every element, including the last on the line
        my $code = unpack 'C', $byte;

        if (!defined $lead) {
            next if $code <= $PLAIN_MAX;

            # A byte in 0x81..0xa0 stops the scan of this line.
            # NOTE(review): the scanner never resumes after such a byte;
            # possibly only the following byte was meant to be skipped --
            # confirm against the encodings this tool is used with.
            last if $code <= $PAIR_MIN;

            $lead = $byte;
            next;
        }

        # A lead byte must be followed by another byte above 0xa0; otherwise
        # the remainder of the line is ignored.
        last if $code <= $PAIR_MIN;

        my $pair = $lead . $byte;
        $count{$pair}++;
        print $pair . " " . $count{$pair} . "\n";    # running count, per hit
        undef $lead;
    }
}

# Final table, sorted by key (plain string order).
for my $key (sort keys %count) {
    print $key . " " . $count{$key} . "\n";
}

mercurial