#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Builds the byte-class table and the state-transition table for a UTF-8
# verifier, passes both to genverifier::GenVerifier, and prints whatever
# that call returns.

use strict;
use warnings;
require "genverifier.pm";
use genverifier;


my(@utf8_cls);
my(@utf8_st);
my($utf8_ver);

#
#
# UTF8 encodes a UCS4 value into 1 to 4 bytes
#
# 1 byte     00 00 00 00     00 00 00 7f
# 2 bytes    00 00 00 80     00 00 07 ff
# 3 bytes    00 00 08 00     00 00 ff ff
# 4 bytes    00 01 00 00     00 10 ff ff
#
# However, since the Surrogate area should not be encoded into UTF8 as
# a Surrogate pair, we can remove the surrogate area from UTF8
#
# 1 byte     00 00 00 00     00 00 00 7f
# 2 bytes    00 00 00 80     00 00 07 ff
# 3 bytes    00 00 08 00     00 00 d7 ff
#            00 00 e0 00     00 00 ff ff
# 4 bytes    00 01 00 00     00 10 ff ff
#
# Now we break them into 6-bit groups for 2-4 byte UTF8
#
# 1 byte              00              7f
# 2 bytes          02 00           1f 3f
# 3 bytes       00 20 00        0d 1f 3f
#               0e 00 00        0f 3f 3f
# 4 bytes    00 10 00 00     04 0f 3f 3f
#
# Break down more
#
# 1 byte              00              7f
# 2 bytes          02 00           1f 3f
# 3 bytes       00 20 00        00 3f 3f
#               01 00 00        0c 3f 3f
#               0d 00 00        0d 1f 3f
#               0e 00 00        0f 3f 3f
# 4 bytes    00 10 00 00     00 3f 3f 3f
#            01 00 00 00     03 3f 3f 3f
#            04 00 00 00     04 0f 3f 3f
#
# Now, add
#  c0 to the lead byte of 2 bytes UTF8
#  e0 to the lead byte of 3 bytes UTF8
#  f0 to the lead byte of 4 bytes UTF8
#  80 to the trail bytes
#
# 1 byte              00              7f
# 2 bytes          c2 80           df bf
# 3 bytes       e0 a0 80        e0 bf bf
#               e1 80 80        ec bf bf
#               ed 80 80        ed 9f bf
#               ee 80 80        ef bf bf
# 4 bytes    f0 90 80 80     f0 bf bf bf
#            f1 80 80 80     f3 bf bf bf
#            f4 80 80 80     f4 8f bf bf
#
#
# Now we can construct our state diagram
#
# 0:0x0e,0x0f,0x1b->Error
# 0:[0-0x7f]->0
# 0:[c2-df]->3
# 0:e0->4
# 0:[e1-ec, ee-ef]->5
# 0:ed->6
# 0:f0->7
# 0:[f1-f3]->8
# 0:f4->9
# 0:*->Error
# 3:[80-bf]->0
# 3:*->Error
# 4:[a0-bf]->3
# 4:*->Error
# 5:[80-bf]->3
# 5:*->Error
# 6:[80-9f]->3
# 6:*->Error
# 7:[90-bf]->5
# 7:*->Error
# 8:[80-bf]->5
# 8:*->Error
# 9:[80-8f]->5
# 9:*->Error
#
# Now we classify the bytes into classes
#
# 0e,0f,1b:k0
# 00-0d,10-1a,1c-7f:k1
# 80-8f:k2
# 90-9f:k3
# a0-bf:k4
# c0-c1:k0
# c2-df:k5
# e0:k6
# e1-ec:k7
# ed:k8
# ee-ef:k7
# f0:k9
# f1-f3:k10
# f4:k11
# f5-ff:k0
#
# Now, let's put them into array form.
# Each entry is [ first byte, last byte, class ].

@utf8_cls = (
  [ 0x00 , 0x00 , 1 ],
  [ 0x0e , 0x0f , 0 ],
  [ 0x1b , 0x1b , 0 ],
  [ 0x01 , 0x0d , 1 ],
  [ 0x10 , 0x1a , 1 ],
  [ 0x1c , 0x7f , 1 ],
  [ 0x80 , 0x8f , 2 ],
  [ 0x90 , 0x9f , 3 ],
  [ 0xa0 , 0xbf , 4 ],
  [ 0xc0 , 0xc1 , 0 ],
  [ 0xc2 , 0xdf , 5 ],
  [ 0xe0 , 0xe0 , 6 ],
  [ 0xe1 , 0xec , 7 ],
  [ 0xed , 0xed , 8 ],
  [ 0xee , 0xef , 7 ],
  [ 0xf0 , 0xf0 , 9 ],
  [ 0xf1 , 0xf3 , 10 ],
  [ 0xf4 , 0xf4 , 11 ],
  [ 0xf5 , 0xff , 0 ],
);
#
# Now, we write the state diagram in terms of classes
#
# 0:k0->Error
# 0:k1->0
# 0:k5->3
# 0:k6->4
# 0:k7->5
# 0:k8->6
# 0:k9->7
# 0:k10->8
# 0:k11->9
# 0:*->Error
# 3:k2,k3,k4->0
# 3:*->Error
# 4:k4->3
# 4:*->Error
# 5:k2,k3,k4->3
# 5:*->Error
# 6:k2,k3->3
# 6:*->Error
# 7:k3,k4->5
# 7:*->Error
# 8:k2,k3,k4->5
# 8:*->Error
# 9:k2->5
# 9:*->Error
#
# Now, let's put them into an array: one row per state, one column per
# class (k0..k11); each cell is the next state.
#
# NOTE(review): the package switch below does not hide the file-scoped
# lexicals declared with my() above; why it is placed here is not evident
# from this file.
package genverifier;
@utf8_st = (
#  0  1  2  3  4  5  6  7  8  9 10 11
   1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9,  # state 0 Start
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # state 1 Error
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # state 2 ItsMe
   1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,  # state 3
   1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1,  # state 4
   1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1,  # state 5
   1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,  # state 6
   # After lead byte f0 the second byte must be 90-bf (k3,k4); 80-8f (k2)
   # would be an overlong encoding of a value below 01 00 00.
   1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1,  # state 7
   1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1,  # state 8
   1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # state 9
);



# 12 matches the number of columns (classes k0..k11) in @utf8_st;
# presumably GenVerifier expects the class count here -- confirm against
# genverifier.pm.
$utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 12, \@utf8_st);
print $utf8_ver;