|
1 #!/usr/local/bin/perl |
|
2 # This Source Code Form is subject to the terms of the Mozilla Public |
|
3 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
5 |
|
6 use strict; |
|
7 require "genverifier.pm"; |
|
8 use genverifier; |
|
9 |
|
10 |
|
# File-scoped storage used by this generator script:
#   @utf8_cls - list of [ first_byte, last_byte, class ] triples
#   @utf8_st  - state-transition table, one row of 12 entries per state
#   $utf8_ver - value returned by genverifier::GenVerifier, printed at the end
my (@utf8_cls, @utf8_st);
my $utf8_ver;
|
14 |
|
15 # |
|
16 # |
|
17 # UTF8 encode the UCS4 into 1 to 4 bytes |
|
18 # |
|
19 # 1 byte 00 00 00 00 00 00 00 7f |
|
20 # 2 bytes 00 00 00 80 00 00 07 ff |
|
21 # 3 bytes 00 00 08 00 00 00 ff ff |
|
22 # 4 bytes 00 01 00 00 00 10 ff ff |
|
23 # |
|
24 # However, since Surrogate area should not be encoded into UTF8 as |
|
25 # a Surrogate pair, we can remove the surrogate area from UTF8 |
|
26 # |
|
27 # 1 byte 00 00 00 00 00 00 00 7f |
|
28 # 2 bytes 00 00 00 80 00 00 07 ff |
|
29 # 3 bytes 00 00 08 00 00 00 d7 ff |
|
30 # 00 00 e0 00 00 00 ff ff |
|
31 # 4 bytes 00 01 00 00 00 10 ff ff |
|
32 # |
|
33 # Now we break them into 6 bits group for 2-4 bytes UTF8 |
|
34 # |
|
35 # 1 byte 00 7f |
|
36 # 2 bytes 02 00 1f 3f |
|
37 # 3 bytes 00 20 00 0d 1f 3f |
|
38 # 0e 00 00 0f 3f 3f |
|
39 # 4 bytes 00 10 00 00 04 0f 3f 3f |
|
40 # |
|
41 # Break down more |
|
42 # |
|
43 # 1 byte 00 7f |
|
44 # 2 bytes 02 00 1f 3f |
|
45 # 3 bytes 00 20 00 00 3f 3f |
|
46 # 01 00 00 0c 3f 3f |
|
47 # 0d 00 00 0d 1f 3f |
|
48 # 0e 00 00 0f 3f 3f |
|
49 # 4 bytes 00 10 00 00 00 3f 3f 3f |
|
50 # 01 00 00 00 03 3f 3f 3f |
|
51 # 04 00 00 00 04 0f 3f 3f |
|
52 # |
|
53 # Now, add |
|
54 # c0 to the lead byte of 2 bytes UTF8 |
|
55 # e0 to the lead byte of 3 bytes UTF8 |
|
56 # f0 to the lead byte of 4 bytes UTF8 |
|
57 # 80 to the trail bytes |
|
58 # |
|
59 # 1 byte 00 7f |
|
60 # 2 bytes c2 80 df bf |
|
61 # 3 bytes e0 a0 80 e0 bf bf |
|
62 # e1 80 80 ec bf bf |
|
63 # ed 80 80 ed 9f bf |
|
64 # ee 80 80 ef bf bf |
|
65 # 4 bytes f0 90 80 80 f0 bf bf bf |
|
66 # f1 80 80 80 f3 bf bf bf |
|
67 # f4 80 80 80 f4 8f bf bf |
|
68 # |
|
69 # |
|
70 # Now we can construct our state diagram |
|
71 # |
|
72 # 0:0x0e,0x0f,0x1b->Error |
|
73 # 0:[0-0x7f]->0 |
|
74 # 0:[c2-df]->3 |
|
75 # 0:e0->4 |
|
76 # 0:[e1-ec, ee-ef]->5 |
|
77 # 0:ed->6 |
|
78 # 0:f0->7 |
|
79 # 0:[f1-f3]->8 |
|
80 # 0:f4->9 |
|
81 # 0:*->Error |
|
82 # 3:[80-bf]->0 |
|
83 # 3:*->Error |
|
84 # 4:[a0-bf]->3 |
|
85 # 4:*->Error |
|
86 # 5:[80-bf]->3 |
|
87 # 5:*->Error |
|
88 # 6:[80-9f]->3 |
|
89 # 6:*->Error |
|
90 # 7:[90-bf]->5 |
|
91 # 7:*->Error |
|
92 # 8:[80-bf]->5 |
|
93 # 8:*->Error |
|
94 # 9:[80-8f]->5 |
|
95 # 9:*->Error |
|
96 # |
|
97 # Now, we classified chars into class |
|
98 # |
|
# 0e,0f,1b:k0
# 00-0d,10-1a,1c-7f:k1
|
101 # 80-8f:k2 |
|
102 # 90-9f:k3 |
|
103 # a0-bf:k4 |
|
104 # c0-c1:k0 |
|
105 # c2-df:k5 |
|
106 # e0:k6 |
|
107 # e1-ec:k7 |
|
108 # ed:k8 |
|
109 # ee-ef:k7 |
|
110 # f0:k9 |
|
111 # f1-f3:k10 |
|
112 # f4:k11 |
|
113 # f5-ff:k0 |
|
114 # |
|
115 # Now, let's put them into array form |
|
116 |
|
# Byte classification table.  Every entry is [ first_byte, last_byte, class ]
# and the class numbers 0..11 are the k0..k11 classes of the state diagram.
# The ranges below cover each byte value 0x00..0xff exactly once.
@utf8_cls = (
    # --- single-byte range -------------------------------------------------
    [0x00, 0x00,  1],    # NUL is classified like ordinary ASCII
    [0x0e, 0x0f,  0],    # SO / SI  : class 0, Error from the start state
    [0x1b, 0x1b,  0],    # ESC      : class 0, Error from the start state
    [0x01, 0x0d,  1],    # ASCII
    [0x10, 0x1a,  1],    # ASCII
    [0x1c, 0x7f,  1],    # ASCII

    # --- trail bytes, split so lead bytes can restrict the second byte ------
    [0x80, 0x8f,  2],
    [0x90, 0x9f,  3],
    [0xa0, 0xbf,  4],

    # --- lead bytes ---------------------------------------------------------
    [0xc0, 0xc1,  0],    # never a valid lead byte in the diagram
    [0xc2, 0xdf,  5],    # 2-byte lead
    [0xe0, 0xe0,  6],    # 3-byte lead, second byte a0-bf
    [0xe1, 0xec,  7],    # 3-byte lead, second byte 80-bf
    [0xed, 0xed,  8],    # 3-byte lead, second byte 80-9f (no surrogates)
    [0xee, 0xef,  7],    # 3-byte lead, second byte 80-bf
    [0xf0, 0xf0,  9],    # 4-byte lead, second byte 90-bf
    [0xf1, 0xf3, 10],    # 4-byte lead, second byte 80-bf
    [0xf4, 0xf4, 11],    # 4-byte lead, second byte 80-8f
    [0xf5, 0xff,  0],    # never a valid lead byte in the diagram
);
|
138 # |
|
139 # Now, we write the state diagram in class |
|
140 # |
|
141 # 0:k0->Error |
|
142 # 0:k1->0 |
|
143 # 0:k5->3 |
|
144 # 0:k6->4 |
|
145 # 0:k7->5 |
|
146 # 0:k8->6 |
|
147 # 0:k9->7 |
|
148 # 0:k10->8 |
|
149 # 0:k11->9 |
|
150 # 0:*->Error |
|
151 # 3:k2,k3,k4->0 |
|
152 # 3:*->Error |
|
153 # 4:k4->3 |
|
154 # 4:*->Error |
|
155 # 5:k2,k3,k4->3 |
|
156 # 5:*->Error |
|
157 # 6:k2,k3->3 |
|
158 # 6:*->Error |
|
159 # 7:k3,k4->5 |
|
160 # 7:*->Error |
|
161 # 8:k2,k3,k4->5 |
|
162 # 8:*->Error |
|
163 # 9:k2->5 |
|
164 # 9:*->Error |
|
165 # |
|
166 # Now, let's put them into array |
|
167 # |
|
# NOTE: @utf8_st is declared with my() near the top of this file, so the
# package switch below does not change which variable is assigned here.
package genverifier;

# State-transition table: one row per state, one column per byte class
# (k0..k11, 12 columns).  Each entry is the next state for that class.
# State 1 is Error and state 2 is ItsMe; states 3-9 are the intermediate
# states of the diagram documented in the comments of this file.
#
# The row for state 7 (reached after lead byte f0) must accept only second
# bytes 90-bf, i.e. classes k3 and k4.  Accepting k2 (80-8f) there would let
# overlong 4-byte encodings through, and rejecting k4 (a0-bf) would refuse
# valid sequences such as f0 a0 80 80.
@utf8_st = (
#   k0 k1 k2 k3 k4 k5 k6 k7 k8 k9 k10 k11
     1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9,    # state 0  Start
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    # state 1  Error
     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    # state 2  ItsMe
     1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,    # state 3  k2,k3,k4 -> 0
     1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1,    # state 4  k4       -> 3
     1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1,    # state 5  k2,k3,k4 -> 3
     1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,    # state 6  k2,k3    -> 3
     1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1,    # state 7  k3,k4    -> 5
     1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1,    # state 8  k2,k3,k4 -> 5
     1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1,    # state 9  k2       -> 5
);
|
182 |
|
183 |
|
184 |
|
# Hand the class table and the state table to genverifier::GenVerifier and
# print whatever it returns.  The literal 12 matches the number of columns
# per state row (classes k0..k11) -- presumably the class count GenVerifier
# expects; confirm against genverifier.pm.
$utf8_ver = genverifier::GenVerifier(
    "UTF8",
    "UTF-8",
    \@utf8_cls,
    12,
    \@utf8_st,
);
print $utf8_ver;
|
187 |
|
188 |
|
189 |