intl/chardet/tools/charfreqtostat.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/chardet/tools/charfreqtostat.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,95 @@
     1.4 +#!/usr/bin/perl
     1.5 +#
     1.6 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.7 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.9 +sub GenNPL {
    1.10 +  # Returns the C comment banner (editor modeline plus MPL 2.0 notice)
    1.11 +  # that is printed ahead of the generated table. Takes no arguments.
    1.12 +  return << "END_NPL";
    1.13 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
    1.14 +/* This Source Code Form is subject to the terms of the Mozilla Public
    1.15 + * License, v. 2.0. If a copy of the MPL was not distributed with this
    1.16 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
    1.17 +END_NPL
    1.18 +}
    1.19 +
    1.20 +# Emit the license banner, then tally lead/trail byte counts from STDIN.
    1.21 +# Field 1 of a record is the character's raw bytes, field 2 its count.
    1.22 +print GenNPL();
    1.23 +$total = 0;
    1.24 +@h = ();
    1.25 +@l = ();
    1.26 +while (<STDIN>)
    1.27 +{
    1.28 +  # split ' ' ignores leading whitespace, so an indented record is not
    1.29 +  # misread as having an empty first field and silently dropped.
    1.30 +  my ($char, $count) = split(' ', $_);
    1.31 +  my ($lead, $trail) = unpack("CCCC", $char);
    1.32 +  # Only 0xA1..0xFE maps onto the 94 cells that are later averaged and
    1.33 +  # printed; a 0xFF byte used to inflate $total without appearing there.
    1.34 +  next unless defined($trail) && 0xA1 <= $lead && $lead <= 0xFE && 0xA1 <= $trail && $trail <= 0xFE;
    1.35 +  $total += $count;
    1.36 +  $h[$lead - 0xA1] += $count;
    1.37 +  $l[$trail - 0xA1] += $count;
    1.38 +}
    1.39 +
    1.40 +
    1.41 +# Turn the raw tallies into relative frequencies (@fh lead, @fl trail) and
    1.42 +# their means over the 94 cells ($mh, $ml). With nothing tallied, stop with
    1.43 +# a message naming the cause instead of "Illegal division by zero".
    1.44 +die "charfreqtostat: no usable records on STDIN\n" if $total == 0;
    1.45 +my ($ffh, $ffl) = (0.0, 0.0);
    1.46 +for my $cell (0 .. 93) {
    1.47 +  $fh[$cell] = ($h[$cell] || 0) / $total;
    1.48 +  $fl[$cell] = ($l[$cell] || 0) / $total;
    1.49 +  $ffh += $fh[$cell];
    1.50 +  $ffl += $fl[$cell];
    1.51 +}
    1.52 +($mh, $ml) = ($ffh / 94.0, $ffl / 94.0);
    1.53 +
    1.54 +# Spread of the two frequency tables: population standard deviation over
    1.55 +# the 94 cells, taken around the means $mh (lead) and $ml (trail).
    1.56 +($sumh, $suml) = (0.0, 0.0);
    1.57 +foreach my $cell (0 .. 93)
    1.58 +{
    1.59 +  my $dev_lead  = $fh[$cell] - $mh;
    1.60 +  my $dev_trail = $fl[$cell] - $ml;
    1.61 +  # Accumulate squared deviations, lead and trail side by side.
    1.62 +  $sumh += $dev_lead * $dev_lead;
    1.63 +  $suml += $dev_trail * $dev_trail;
    1.64 +}
    1.65 +# The mean squared deviation is the variance; its square root is the
    1.66 +# standard deviation printed in the generated table.
    1.67 +$sumh /= 94.0;
    1.68 +$suml /= 94.0;
    1.69 +($stdh, $stdl) = (sqrt($sumh), sqrt($suml));
    1.70 +
    1.71 +# Print the C initializer: lead-byte table and stats, then the trail-byte ones.
    1.72 +print "{\n";
    1.73 +print "  {\n";
    1.74 +for my $byte (0xA1 .. 0xFE) {
    1.75 +  # Numeric ==, not eq: the last entry (0xFE) is written without a comma.
    1.76 +  my $fmt = ($byte == 0xFE) ? "   %.6ff  \/\/ FreqH[%2x]\n"
    1.77 +                            : "   %.6ff, \/\/ FreqH[%2x]\n";
    1.78 +  printf($fmt, $fh[$byte - 0xA1], $byte);
    1.79 +}
    1.80 +print "  },\n";
    1.81 +# Each weight is that deviation's share of the combined deviation. If both
    1.82 +# deviations are zero the ratio is undefined, so use an even split (0.5)
    1.83 +# rather than dying on division by zero. NOTE(review): confirm the fallback.
    1.84 +my $spread = $stdh + $stdl;
    1.85 +printf ("%.6ff, \/\/ Lead Byte StdDev\n", $stdh);
    1.86 +printf ("%.6ff, \/\/ Lead Byte Mean\n", $mh);
    1.87 +printf ("%.6ff, \/\/ Lead Byte Weight\n", $spread ? $stdh / $spread : 0.5);
    1.88 +print "  {\n";
    1.89 +for my $byte (0xA1 .. 0xFE) {
    1.90 +  my $fmt = ($byte == 0xFE) ? "  %.6ff  \/\/ FreqL[%2x]\n"
    1.91 +                            : "  %.6ff, \/\/ FreqL[%2x]\n";
    1.92 +  printf($fmt, $fl[$byte - 0xA1], $byte);
    1.93 +}
    1.94 +print "  },\n";
    1.95 +printf ("%.6ff, \/\/ Trail Byte StdDev\n", $stdl);
    1.96 +printf ("%.6ff, \/\/ Trail Byte Mean\n", $ml);
    1.97 +printf ("%.6ff  \/\/ Trail Byte Weight\n", $spread ? $stdl / $spread : 0.5);
    1.98 +print "};\n";

mercurial