Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | #!/usr/local/bin/perl |
michael@0 | 2 | # This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
michael@0 | 5 | |
michael@0 | 6 | use strict; |
michael@0 | 7 | require "genverifier.pm"; |
michael@0 | 8 | use genverifier; |
michael@0 | 9 | |
michael@0 | 10 | |
michael@0 | 11 | my(@utf8_cls); |
michael@0 | 12 | my(@utf8_st); |
michael@0 | 13 | my($utf8_ver); |
michael@0 | 14 | |
michael@0 | 15 | # |
michael@0 | 16 | # |
michael@0 | 17 | # UTF8 encode the UCS4 into 1 to 4 bytes |
michael@0 | 18 | # |
michael@0 | 19 | # 1 byte 00 00 00 00 00 00 00 7f |
michael@0 | 20 | # 2 bytes 00 00 00 80 00 00 07 ff |
michael@0 | 21 | # 3 bytes 00 00 08 00 00 00 ff ff |
michael@0 | 22 | # 4 bytes 00 01 00 00 00 10 ff ff |
michael@0 | 23 | # |
michael@0 | 24 | # However, since the surrogate area (d800-dfff) should not be encoded into UTF8 as |
michael@0 | 25 | # surrogate pairs, we can remove the surrogate area from the UTF8 ranges |
michael@0 | 26 | # |
michael@0 | 27 | # 1 byte 00 00 00 00 00 00 00 7f |
michael@0 | 28 | # 2 bytes 00 00 00 80 00 00 07 ff |
michael@0 | 29 | # 3 bytes 00 00 08 00 00 00 d7 ff |
michael@0 | 30 | # 00 00 e0 00 00 00 ff ff |
michael@0 | 31 | # 4 bytes 00 01 00 00 00 10 ff ff |
michael@0 | 32 | # |
michael@0 | 33 | # Now we break them into 6-bit groups for 2-4 byte UTF8 |
michael@0 | 34 | # |
michael@0 | 35 | # 1 byte 00 7f |
michael@0 | 36 | # 2 bytes 02 00 1f 3f |
michael@0 | 37 | # 3 bytes 00 20 00 0d 1f 3f |
michael@0 | 38 | # 0e 00 00 0f 3f 3f |
michael@0 | 39 | # 4 bytes 00 10 00 00 04 0f 3f 3f |
michael@0 | 40 | # |
michael@0 | 41 | # Break down more |
michael@0 | 42 | # |
michael@0 | 43 | # 1 byte 00 7f |
michael@0 | 44 | # 2 bytes 02 00 1f 3f |
michael@0 | 45 | # 3 bytes 00 20 00 00 3f 3f |
michael@0 | 46 | # 01 00 00 0c 3f 3f |
michael@0 | 47 | # 0d 00 00 0d 1f 3f |
michael@0 | 48 | # 0e 00 00 0f 3f 3f |
michael@0 | 49 | # 4 bytes 00 10 00 00 00 3f 3f 3f |
michael@0 | 50 | # 01 00 00 00 03 3f 3f 3f |
michael@0 | 51 | # 04 00 00 00 04 0f 3f 3f |
michael@0 | 52 | # |
michael@0 | 53 | # Now, add |
michael@0 | 54 | # c0 to the lead byte of 2 bytes UTF8 |
michael@0 | 55 | # e0 to the lead byte of 3 bytes UTF8 |
michael@0 | 56 | # f0 to the lead byte of 4 bytes UTF8 |
michael@0 | 57 | # 80 to the trail bytes |
michael@0 | 58 | # |
michael@0 | 59 | # 1 byte 00 7f |
michael@0 | 60 | # 2 bytes c2 80 df bf |
michael@0 | 61 | # 3 bytes e0 a0 80 e0 bf bf |
michael@0 | 62 | # e1 80 80 ec bf bf |
michael@0 | 63 | # ed 80 80 ed 9f bf |
michael@0 | 64 | # ee 80 80 ef bf bf |
michael@0 | 65 | # 4 bytes f0 90 80 80 f0 bf bf bf |
michael@0 | 66 | # f1 80 80 80 f3 bf bf bf |
michael@0 | 67 | # f4 80 80 80 f4 8f bf bf |
michael@0 | 68 | # |
michael@0 | 69 | # |
michael@0 | 70 | # Now we can construct our state diagram |
michael@0 | 71 | # |
michael@0 | 72 | # 0:0x0e,0x0f,0x1b->Error |
michael@0 | 73 | # 0:[0-0x7f]->0 |
michael@0 | 74 | # 0:[c2-df]->3 |
michael@0 | 75 | # 0:e0->4 |
michael@0 | 76 | # 0:[e1-ec, ee-ef]->5 |
michael@0 | 77 | # 0:ed->6 |
michael@0 | 78 | # 0:f0->7 |
michael@0 | 79 | # 0:[f1-f3]->8 |
michael@0 | 80 | # 0:f4->9 |
michael@0 | 81 | # 0:*->Error |
michael@0 | 82 | # 3:[80-bf]->0 |
michael@0 | 83 | # 3:*->Error |
michael@0 | 84 | # 4:[a0-bf]->3 |
michael@0 | 85 | # 4:*->Error |
michael@0 | 86 | # 5:[80-bf]->3 |
michael@0 | 87 | # 5:*->Error |
michael@0 | 88 | # 6:[80-9f]->3 |
michael@0 | 89 | # 6:*->Error |
michael@0 | 90 | # 7:[90-bf]->5 |
michael@0 | 91 | # 7:*->Error |
michael@0 | 92 | # 8:[80-bf]->5 |
michael@0 | 93 | # 8:*->Error |
michael@0 | 94 | # 9:[80-8f]->5 |
michael@0 | 95 | # 9:*->Error |
michael@0 | 96 | # |
michael@0 | 97 | # Now, we classify the bytes into classes |
michael@0 | 98 | # |
michael@0 | 99 | # 0e,0f,1b:k0 |
michael@0 | 100 | # 00-0d,10-1a,1c-7f:k1 |
michael@0 | 101 | # 80-8f:k2 |
michael@0 | 102 | # 90-9f:k3 |
michael@0 | 103 | # a0-bf:k4 |
michael@0 | 104 | # c0-c1:k0 |
michael@0 | 105 | # c2-df:k5 |
michael@0 | 106 | # e0:k6 |
michael@0 | 107 | # e1-ec:k7 |
michael@0 | 108 | # ed:k8 |
michael@0 | 109 | # ee-ef:k7 |
michael@0 | 110 | # f0:k9 |
michael@0 | 111 | # f1-f3:k10 |
michael@0 | 112 | # f4:k11 |
michael@0 | 113 | # f5-ff:k0 |
michael@0 | 114 | # |
michael@0 | 115 | # Now, let's put them into array form: one [ first byte , last byte , class ] entry per range (bounds inclusive). Note that 0x00 is given class 1 here. |
michael@0 | 116 | |
michael@0 | 117 | @utf8_cls = ( |
michael@0 | 118 | [ 0x00 , 0x00 , 1 ], |
michael@0 | 119 | [ 0x0e , 0x0f , 0 ], |
michael@0 | 120 | [ 0x1b , 0x1b , 0 ], |
michael@0 | 121 | [ 0x01 , 0x0d , 1 ], |
michael@0 | 122 | [ 0x10 , 0x1a , 1 ], |
michael@0 | 123 | [ 0x1c , 0x7f , 1 ], |
michael@0 | 124 | [ 0x80 , 0x8f , 2 ], |
michael@0 | 125 | [ 0x90 , 0x9f , 3 ], |
michael@0 | 126 | [ 0xa0 , 0xbf , 4 ], |
michael@0 | 127 | [ 0xc0 , 0xc1 , 0 ], |
michael@0 | 128 | [ 0xc2 , 0xdf , 5 ], |
michael@0 | 129 | [ 0xe0 , 0xe0 , 6 ], |
michael@0 | 130 | [ 0xe1 , 0xec , 7 ], |
michael@0 | 131 | [ 0xed , 0xed , 8 ], |
michael@0 | 132 | [ 0xee , 0xef , 7 ], |
michael@0 | 133 | [ 0xf0 , 0xf0 , 9 ], |
michael@0 | 134 | [ 0xf1 , 0xf3 , 10 ], |
michael@0 | 135 | [ 0xf4 , 0xf4 , 11 ], |
michael@0 | 136 | [ 0xf5 , 0xff , 0 ], |
michael@0 | 137 | ); |
michael@0 | 138 | # |
michael@0 | 139 | # Now, we rewrite the state diagram in terms of classes |
michael@0 | 140 | # |
michael@0 | 141 | # 0:k0->Error |
michael@0 | 142 | # 0:k1->0 |
michael@0 | 143 | # 0:k5->3 |
michael@0 | 144 | # 0:k6->4 |
michael@0 | 145 | # 0:k7->5 |
michael@0 | 146 | # 0:k8->6 |
michael@0 | 147 | # 0:k9->7 |
michael@0 | 148 | # 0:k10->8 |
michael@0 | 149 | # 0:k11->9 |
michael@0 | 150 | # 0:*->Error |
michael@0 | 151 | # 3:k2,k3,k4->0 |
michael@0 | 152 | # 3:*->Error |
michael@0 | 153 | # 4:k4->3 |
michael@0 | 154 | # 4:*->Error |
michael@0 | 155 | # 5:k2,k3,k4->3 |
michael@0 | 156 | # 5:*->Error |
michael@0 | 157 | # 6:k2,k3->3 |
michael@0 | 158 | # 6:*->Error |
michael@0 | 159 | # 7:k3,k4->5 |
michael@0 | 160 | # 7:*->Error |
michael@0 | 161 | # 8:k2,k3,k4->5 |
michael@0 | 162 | # 8:*->Error |
michael@0 | 163 | # 9:k2->5 |
michael@0 | 164 | # 9:*->Error |
michael@0 | 165 | # |
michael@0 | 166 | # Now, let's put them into array: one row per state, one column per class (k0..k11), |
michael@0 | 167 | # each entry is the next state. States 1 (Error) and 2 (ItsMe) only lead to themselves. |
michael@0 | 168 | package genverifier; # NOTE(review): @utf8_st is a file-scoped 'my' variable, so this package switch does not move it -- confirm it is still needed |
michael@0 | 169 | @utf8_st = ( |
michael@0 | 170 | # 0 1 2 3 4 5 6 7 8 9 10 11 <- class |
michael@0 | 171 | 1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9, # state 0 Start |
michael@0 | 172 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error |
michael@0 | 173 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe |
michael@0 | 174 | 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, # state 3: 80-bf (k2,k3,k4) -> 0 |
michael@0 | 175 | 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, # state 4: after e0, a0-bf (k4) -> 3 |
michael@0 | 176 | 1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, # state 5: 80-bf (k2,k3,k4) -> 3 |
michael@0 | 177 | 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, # state 6: after ed, 80-9f (k2,k3) -> 3 |
michael@0 | 178 | 1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 7: after f0, 90-bf (k3,k4) -> 5; 80-8f would encode a value below 01 00 00 |
michael@0 | 179 | 1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 8: after f1-f3, 80-bf (k2,k3,k4) -> 5 |
michael@0 | 180 | 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9: after f4, 80-8f (k2) -> 5; keeps the value at or below 10 ff ff |
michael@0 | 181 | ); |
michael@0 | 182 | |
michael@0 | 183 | |
michael@0 | 184 | |
michael@0 | 185 | $utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 12, \@utf8_st); # 12 is presumably the class count (k0..k11) -- confirm against genverifier.pm |
michael@0 | 186 | print $utf8_ver; |
michael@0 | 187 | |
michael@0 | 188 | |
michael@0 | 189 |