intl/chardet/tools/genutf8.pl

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 #!/usr/local/bin/perl
     2 # This Source Code Form is subject to the terms of the Mozilla Public
     3 # License, v. 2.0. If a copy of the MPL was not distributed with this
     4 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
     6 use strict;
     7 require "genverifier.pm";   # run-time load of the generator module
     8 use genverifier;            # compile-time load/import of the same module
    11 my(@utf8_cls);              # byte-range rows [ first, last, class ], filled in below
    12 my(@utf8_st);               # flattened state-transition table, filled in below
    13 my($utf8_ver);              # receives the return value of genverifier::GenVerifier
    15 #
    16 #
    17 # UTF8 encode the UCS4 into 1 to 4 bytes
    18 #
    19 # 1 byte    00 00 00 00   00 00 00 7f
    20 # 2 bytes   00 00 00 80   00 00 07 ff
    21 # 3 bytes   00 00 08 00   00 00 ff ff
    22 # 4 bytes   00 01 00 00   00 10 ff ff
    23 #
    24 # However, since Surrogate area should not be encoded into UTF8 as
    25 # a Surrogate pair, we can remove the surrogate area from UTF8
    26 #
    27 # 1 byte    00 00 00 00   00 00 00 7f
    28 # 2 bytes   00 00 00 80   00 00 07 ff
    29 # 3 bytes   00 00 08 00   00 00 d7 ff
    30 #           00 00 e0 00   00 00 ff ff
    31 # 4 bytes   00 01 00 00   00 10 ff ff
    32 #
    33 # Now we break them into 6-bit groups for 2-4 byte UTF8
    34 #
    35 # 1 byte                   00                  7f
    36 # 2 bytes               02 00               1f 3f
    37 # 3 bytes            00 20 00            0d 1f 3f
    38 #                    0e 00 00            0f 3f 3f
    39 # 4 bytes         00 10 00 00         04 0f 3f 3f
    40 #
    41 # Break down more
    42 #
    43 # 1 byte                   00                  7f
    44 # 2 bytes               02 00               1f 3f
    45 # 3 bytes            00 20 00            00 3f 3f
    46 #                    01 00 00            0c 3f 3f
    47 #                    0d 00 00            0d 1f 3f
    48 #                    0e 00 00            0f 3f 3f
    49 # 4 bytes         00 10 00 00         00 3f 3f 3f
    50 #                 01 00 00 00         03 3f 3f 3f
    51 #                 04 00 00 00         04 0f 3f 3f
    52 #
    53 # Now, add
    54 #  c0 to the lead byte of 2 bytes UTF8
    55 #  e0 to the lead byte of 3 bytes UTF8
    56 #  f0 to the lead byte of 4 bytes UTF8
    57 #  80 to the trail bytes
    58 #
    59 # 1 byte                   00                  7f
    60 # 2 bytes               c2 80               df bf
    61 # 3 bytes            e0 a0 80            e0 bf bf
    62 #                    e1 80 80            ec bf bf
    63 #                    ed 80 80            ed 9f bf
    64 #                    ee 80 80            ef bf bf
    65 # 4 bytes         f0 90 80 80         f0 bf bf bf
    66 #                 f1 80 80 80         f3 bf bf bf
    67 #                 f4 80 80 80         f4 8f bf bf
    68 #
    69 #
    70 # Now we can construct our state diagram
    71 #
    72 # 0:0x0e,0x0f,0x1b->Error
    73 # 0:[0-0x7f]->0
    74 # 0:[c2-df]->3
    75 # 0:e0->4
    76 # 0:[e1-ec, ee-ef]->5
    77 # 0:ed->6
    78 # 0:f0->7
    79 # 0:[f1-f3]->8
    80 # 0:f4->9
    81 # 0:*->Error
    82 # 3:[80-bf]->0
    83 # 3:*->Error
    84 # 4:[a0-bf]->3
    85 # 4:*->Error
    86 # 5:[80-bf]->3
    87 # 5:*->Error
    88 # 6:[80-9f]->3
    89 # 6:*->Error
    90 # 7:[90-bf]->5
    91 # 7:*->Error
    92 # 8:[80-bf]->5
    93 # 8:*->Error
    94 # 9:[80-8f]->5
    95 # 9:*->Error
    96 #
    97 # Now, we classify the bytes into classes
    98 #
    99 # 0e,0f,1b:k0
   100 # 00-0d,10-1a,1c-7f:k1
   101 # 80-8f:k2
   102 # 90-9f:k3
   103 # a0-bf:k4
   104 # c0-c1:k0
   105 # c2-df:k5
   106 # e0:k6
   107 # e1-ec:k7
   108 # ed:k8
   109 # ee-ef:k7
   110 # f0:k9
   111 # f1-f3:k10
   112 # f4:k11
   113 # f5-ff:k0
   114 #
   115 # Now, let's put them into array form
   117 @utf8_cls = (             # each row: [ first byte, last byte, class number ]
   118  [ 0x00 , 0x00 , 1 ],     # NUL is class 1, so the start state accepts it
   119  [ 0x0e , 0x0f , 0 ],     # k0: error from the start state
   120  [ 0x1b , 0x1b , 0 ],     # k0: error from the start state
   121  [ 0x01 , 0x0d , 1 ],     # k1: single-byte characters
   122  [ 0x10 , 0x1a , 1 ],     # k1
   123  [ 0x1c , 0x7f , 1 ],     # k1
   124  [ 0x80 , 0x8f , 2 ],     # k2: trail byte, low range
   125  [ 0x90 , 0x9f , 3 ],     # k3: trail byte, middle range
   126  [ 0xa0 , 0xbf , 4 ],     # k4: trail byte, high range
   127  [ 0xc0 , 0xc1 , 0 ],     # k0: below c2, the first 2-byte lead
   128  [ 0xc2 , 0xdf , 5 ],     # k5: lead byte of a 2-byte sequence
   129  [ 0xe0 , 0xe0 , 6 ],     # k6: 3-byte lead, second byte must be a0-bf
   130  [ 0xe1 , 0xec , 7 ],     # k7: 3-byte lead, second byte 80-bf
   131  [ 0xed , 0xed , 8 ],     # k8: 3-byte lead, second byte must be 80-9f (skips surrogates)
   132  [ 0xee , 0xef , 7 ],     # k7: 3-byte lead, second byte 80-bf
   133  [ 0xf0 , 0xf0 , 9 ],     # k9: 4-byte lead, second byte must be 90-bf
   134  [ 0xf1 , 0xf3 , 10 ],    # k10: 4-byte lead, second byte 80-bf
   135  [ 0xf4 , 0xf4 , 11 ],    # k11: 4-byte lead, second byte must be 80-8f
   136  [ 0xf5 , 0xff , 0 ],     # k0: above f4, the last 4-byte lead
   137 );
   138 #
   139 # Now, we write the state diagram in terms of classes
   140 #
   141 # 0:k0->Error
   142 # 0:k1->0
   143 # 0:k5->3
   144 # 0:k6->4
   145 # 0:k7->5
   146 # 0:k8->6
   147 # 0:k9->7
   148 # 0:k10->8
   149 # 0:k11->9
   150 # 0:*->Error
   151 # 3:k2,k3,k4->0
   152 # 3:*->Error
   153 # 4:k4->3
   154 # 4:*->Error
   155 # 5:k2,k3,k4->3
   156 # 5:*->Error
   157 # 6:k2,k3->3
   158 # 6:*->Error
   159 # 7:k3,k4->5
   160 # 7:*->Error
   161 # 8:k2,k3,k4->5
   162 # 8:*->Error
   163 # 9:k2->5
   164 # 9:*->Error
   165 #
   166 # Now, let's put them into array
   167 #
   168 package genverifier;      # switches the current package; the file-scoped my() variables stay visible
   169 @utf8_st = (              # one row per state, one column per class k0..k11; each entry is the next state
   170 # 0  1  2  3  4  5  6  7  8  9 10 11
   171   1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9, # state 0 Start
   172   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error
   173   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe
   174   1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, # state 3: last trail byte 80-bf (k2,k3,k4) -> Start
   175   1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, # state 4: after e0, needs a0-bf (k4)
   176   1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, # state 5: two trail bytes remaining, 80-bf (k2,k3,k4)
   177   1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, # state 6: after ed, needs 80-9f (k2,k3)
   178   1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 7: after f0, needs 90-bf (k3,k4); 80-8f would be overlong
   179   1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 8: after f1-f3, needs 80-bf (k2,k3,k4)
   180   1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9: after f4, needs 80-8f (k2)
   181 );
   185 $utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 12,     \@utf8_st); # 12 matches the k0..k11 column count of @utf8_st; NOTE(review): argument meanings are defined in genverifier.pm - confirm there
   186 print $utf8_ver; # write GenVerifier's return value to STDOUT

mercurial