intl/chardet/tools/genutf8.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/chardet/tools/genutf8.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,189 @@
     1.4 +#!/usr/local/bin/perl
     1.5 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.6 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.8 +
     1.9 +use strict;
    1.10 +require "genverifier.pm";
    1.11 +use genverifier;
    1.12 +
    1.13 +
    1.14 +my @utf8_cls;   # byte-range -> class table, filled in below
    1.15 +my @utf8_st;    # flattened state-transition table, filled in below
    1.16 +my $utf8_ver;   # value returned by genverifier::GenVerifier, printed at the end
    1.17 +
    1.18 +#
    1.19 +#
    1.20 +# UTF8 encode the UCS4 into 1 to 4 bytes
    1.21 +#
    1.22 +# 1 byte    00 00 00 00   00 00 00 7f
    1.23 +# 2 bytes   00 00 00 80   00 00 07 ff
    1.24 +# 3 bytes   00 00 08 00   00 00 ff ff
    1.25 +# 4 bytes   00 01 00 00   00 10 ff ff
    1.26 +#
    1.27 +# However, since the surrogate area (d800-dfff) must not be encoded in
    1.28 +# UTF8, we can remove the surrogate area from the 3 bytes range
    1.29 +#
    1.30 +# 1 byte    00 00 00 00   00 00 00 7f
    1.31 +# 2 bytes   00 00 00 80   00 00 07 ff
    1.32 +# 3 bytes   00 00 08 00   00 00 d7 ff
    1.33 +#           00 00 e0 00   00 00 ff ff
    1.34 +# 4 bytes   00 01 00 00   00 10 ff ff
    1.35 +#
    1.36 +# Now we break them into 6-bit groups for 2-4 byte UTF8
    1.37 +#
    1.38 +# 1 byte                   00                  7f
    1.39 +# 2 bytes               02 00               1f 3f
    1.40 +# 3 bytes            00 20 00            0d 1f 3f
    1.41 +#                    0e 00 00            0f 3f 3f
    1.42 +# 4 bytes         00 10 00 00         04 0f 3f 3f
    1.43 +#
    1.44 +# Break down more
    1.45 +#
    1.46 +# 1 byte                   00                  7f
    1.47 +# 2 bytes               02 00               1f 3f
    1.48 +# 3 bytes            00 20 00            00 3f 3f
    1.49 +#                    01 00 00            0c 3f 3f
    1.50 +#                    0d 00 00            0d 1f 3f
    1.51 +#                    0e 00 00            0f 3f 3f
    1.52 +# 4 bytes         00 10 00 00         00 3f 3f 3f
    1.53 +#                 01 00 00 00         03 3f 3f 3f
    1.54 +#                 04 00 00 00         04 0f 3f 3f
    1.55 +#
    1.56 +# Now, add
    1.57 +#  c0 to the lead byte of 2 bytes UTF8
    1.58 +#  e0 to the lead byte of 3 bytes UTF8
    1.59 +#  f0 to the lead byte of 4 bytes UTF8
    1.60 +#  80 to the trail bytes
    1.61 +#
    1.62 +# 1 byte                   00                  7f
    1.63 +# 2 bytes               c2 80               df bf
    1.64 +# 3 bytes            e0 a0 80            e0 bf bf
    1.65 +#                    e1 80 80            ec bf bf
    1.66 +#                    ed 80 80            ed 9f bf
    1.67 +#                    ee 80 80            ef bf bf
    1.68 +# 4 bytes         f0 90 80 80         f0 bf bf bf
    1.69 +#                 f1 80 80 80         f3 bf bf bf
    1.70 +#                 f4 80 80 80         f4 8f bf bf
    1.71 +#
    1.72 +#
    1.73 +# Now we can construct our state diagram
    1.74 +#
    1.75 +# 0:0x0e,0x0f,0x1b->Error
    1.76 +# 0:[0-0x7f]->0
    1.77 +# 0:[c2-df]->3
    1.78 +# 0:e0->4
    1.79 +# 0:[e1-ec, ee-ef]->5
    1.80 +# 0:ed->6
    1.81 +# 0:f0->7
    1.82 +# 0:[f1-f3]->8
    1.83 +# 0:f4->9
    1.84 +# 0:*->Error
    1.85 +# 3:[80-bf]->0
    1.86 +# 3:*->Error
    1.87 +# 4:[a0-bf]->3
    1.88 +# 4:*->Error
    1.89 +# 5:[80-bf]->3
    1.90 +# 5:*->Error
    1.91 +# 6:[80-9f]->3
    1.92 +# 6:*->Error
    1.93 +# 7:[90-bf]->5
    1.94 +# 7:*->Error
    1.95 +# 8:[80-bf]->5
    1.96 +# 8:*->Error
    1.97 +# 9:[80-8f]->5
    1.98 +# 9:*->Error
    1.99 +#
   1.100 +# Now we classify the bytes into classes
   1.101 +#
   1.102 +# 0e,0f,1b:k0
   1.103 +# 00-0d,10-1a,1c-7f:k1
   1.104 +# 80-8f:k2
   1.105 +# 90-9f:k3
   1.106 +# a0-bf:k4
   1.107 +# c0-c1:k0
   1.108 +# c2-df:k5
   1.109 +# e0:k6
   1.110 +# e1-ec:k7
   1.111 +# ed:k8
   1.112 +# ee-ef:k7
   1.113 +# f0:k9
   1.114 +# f1-f3:k10
   1.115 +# f4:k11
   1.116 +# f5-ff:k0
   1.117 +#
   1.118 +# Now, let's put them into array form
   1.119 +
   1.120 +@utf8_cls = (            # each entry: [ first byte, last byte, class ], range inclusive
   1.121 + [ 0x00 , 0x00 , 1 ],    # NUL: class 1, i.e. accepted like other ASCII
   1.122 + [ 0x0e , 0x0f , 0 ],    # k0: SO/SI, rejected from the Start state
   1.123 + [ 0x1b , 0x1b , 0 ],    # k0: ESC, rejected from the Start state
   1.124 + [ 0x01 , 0x0d , 1 ],    # k1: ASCII
   1.125 + [ 0x10 , 0x1a , 1 ],    # k1: ASCII
   1.126 + [ 0x1c , 0x7f , 1 ],    # k1: ASCII
   1.127 + [ 0x80 , 0x8f , 2 ],    # k2: trail byte 80-8f
   1.128 + [ 0x90 , 0x9f , 3 ],    # k3: trail byte 90-9f
   1.129 + [ 0xa0 , 0xbf , 4 ],    # k4: trail byte a0-bf
   1.130 + [ 0xc0 , 0xc1 , 0 ],    # k0: never a lead byte (2-byte leads start at c2)
   1.131 + [ 0xc2 , 0xdf , 5 ],    # k5: lead byte of a 2-byte sequence
   1.132 + [ 0xe0 , 0xe0 , 6 ],    # k6: 3-byte lead, second byte must be a0-bf
   1.133 + [ 0xe1 , 0xec , 7 ],    # k7: 3-byte lead, second byte 80-bf
   1.134 + [ 0xed , 0xed , 8 ],    # k8: 3-byte lead, second byte must be 80-9f (skips surrogates)
   1.135 + [ 0xee , 0xef , 7 ],    # k7: 3-byte lead, second byte 80-bf
   1.136 + [ 0xf0 , 0xf0 , 9 ],    # k9: 4-byte lead, second byte must be 90-bf
   1.137 + [ 0xf1 , 0xf3 , 10 ],   # k10: 4-byte lead, second byte 80-bf
   1.138 + [ 0xf4 , 0xf4 , 11 ],   # k11: 4-byte lead, second byte must be 80-8f (stops at 10ffff)
   1.139 + [ 0xf5 , 0xff , 0 ],    # k0: would encode past 10ffff, never valid
   1.140 +);
   1.141 +#
   1.142 +# Now we rewrite the state diagram in terms of classes
   1.143 +#
   1.144 +# 0:k0->Error
   1.145 +# 0:k1->0
   1.146 +# 0:k5->3
   1.147 +# 0:k6->4
   1.148 +# 0:k7->5
   1.149 +# 0:k8->6
   1.150 +# 0:k9->7
   1.151 +# 0:k10->8
   1.152 +# 0:k11->9
   1.153 +# 0:*->Error
   1.154 +# 3:k2,k3,k4->0
   1.155 +# 3:*->Error
   1.156 +# 4:k4->3
   1.157 +# 4:*->Error
   1.158 +# 5:k2,k3,k4->3
   1.159 +# 5:*->Error
   1.160 +# 6:k2,k3->3
   1.161 +# 6:*->Error
   1.162 +# 7:k3,k4->5
   1.163 +# 7:*->Error
   1.164 +# 8:k2,k3,k4->5
   1.165 +# 8:*->Error
   1.166 +# 9:k2->5
   1.167 +# 9:*->Error
   1.168 +#
   1.169 +# Now, let's put them into array
   1.170 +#
   1.171 +package genverifier;   # only changes the current package; the file-scope `my` variables stay visible
   1.172 +@utf8_st = (           # one row per state, one column per class k0..k11; each cell is the next state
   1.173 +# 0  1  2  3  4  5  6  7  8  9 10 11
   1.174 +  1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9, # state 0 Start: ASCII stays here, lead bytes pick a continuation state
   1.175 +  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error: every class stays in Error
   1.176 +  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe: every class stays in ItsMe
   1.177 +  1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, # state 3: last trail byte, 80-bf (k2,k3,k4) -> Start
   1.178 +  1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, # state 4: after e0, need a0-bf (k4)
   1.179 +  1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, # state 5: two trail bytes left, 80-bf (k2,k3,k4)
   1.180 +  1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, # state 6: after ed, need 80-9f (k2,k3); a0-bf would be a surrogate
   1.181 +  1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 7: after f0, need 90-bf (k3,k4); 80-8f would be overlong
   1.182 +  1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 8: after f1-f3, need 80-bf (k2,k3,k4)
   1.183 +  1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9: after f4, need 80-8f (k2); above that exceeds 10ffff
   1.184 +);
   1.185 +
   1.186 +
   1.187 +
   1.188 +$utf8_ver = genverifier::GenVerifier('UTF8', 'UTF-8', \@utf8_cls, 12, \@utf8_st);   # 12 matches the class count k0..k11 -- presumably the table width; confirm in genverifier.pm
   1.189 +print($utf8_ver);   # emit whatever GenVerifier returned on standard output
   1.190 +
   1.191 +
   1.192 +

mercurial