#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Generates the UTF-8 verifier tables: a byte -> character-class map and a
# (state x class) transition table, which are handed to
# genverifier::GenVerifier; whatever it returns is printed to STDOUT.

use strict;
use warnings;
require "genverifier.pm";
use genverifier;

#
#
# UTF8 encode the UCS4 into 1 to 4 bytes
#
# 1 byte    00 00 00 00   00 00 00 7f
# 2 bytes   00 00 00 80   00 00 07 ff
# 3 bytes   00 00 08 00   00 00 ff ff
# 4 bytes   00 01 00 00   00 10 ff ff
#
# However, since the Surrogate area should not be encoded into UTF8 as
# a Surrogate pair, we can remove the surrogate area from UTF8
#
# 1 byte    00 00 00 00   00 00 00 7f
# 2 bytes   00 00 00 80   00 00 07 ff
# 3 bytes   00 00 08 00   00 00 d7 ff
#           00 00 e0 00   00 00 ff ff
# 4 bytes   00 01 00 00   00 10 ff ff
#
# Now we break them into 6-bit groups for 2-4 byte UTF8
#
# 1 byte                   00                  7f
# 2 bytes               02 00               1f 3f
# 3 bytes            00 20 00            0d 1f 3f
#                    0e 00 00            0f 3f 3f
# 4 bytes         00 10 00 00         04 0f 3f 3f
#
# Break down more
#
# 1 byte                   00                  7f
# 2 bytes               02 00               1f 3f
# 3 bytes            00 20 00            00 3f 3f
#                    01 00 00            0c 3f 3f
#                    0d 00 00            0d 1f 3f
#                    0e 00 00            0f 3f 3f
# 4 bytes         00 10 00 00         00 3f 3f 3f
#                 01 00 00 00         03 3f 3f 3f
#                 04 00 00 00         04 0f 3f 3f
#
# Now, add
#  c0 to the lead byte of 2 bytes UTF8
#  e0 to the lead byte of 3 bytes UTF8
#  f0 to the lead byte of 4 bytes UTF8
#  80 to the trail bytes
#
# 1 byte                   00                  7f
# 2 bytes               c2 80               df bf
# 3 bytes            e0 a0 80            e0 bf bf
#                    e1 80 80            ec bf bf
#                    ed 80 80            ed 9f bf
#                    ee 80 80            ef bf bf
# 4 bytes         f0 90 80 80         f0 bf bf bf
#                 f1 80 80 80         f3 bf bf bf
#                 f4 80 80 80         f4 8f bf bf
#
#
# Now we can construct our state diagram
#
# 0:0x0e,0x0f,0x1b->Error
# 0:[0-0x7f]->0
# 0:[c2-df]->3
# 0:e0->4
# 0:[e1-ec, ee-ef]->5
# 0:ed->6
# 0:f0->7
# 0:[f1-f3]->8
# 0:f4->9
# 0:*->Error
# 3:[80-bf]->0
# 3:*->Error
# 4:[a0-bf]->3
# 4:*->Error
# 5:[80-bf]->3
# 5:*->Error
# 6:[80-9f]->3
# 6:*->Error
# 7:[90-bf]->5
# 7:*->Error
# 8:[80-bf]->5
# 8:*->Error
# 9:[80-8f]->5
# 9:*->Error
#
# Now, we classify chars into classes
#
# 0e,0f,1b:k0
# 00-0d,10-1a,1c-7f:k1
# 80-8f:k2
# 90-9f:k3
# a0-bf:k4
# c0-c1:k0
# c2-df:k5
# e0:k6
# e1-ec:k7
# ed:k8
# ee-ef:k7
# f0:k9
# f1-f3:k10
# f4:k11
# f5-ff:k0
#
# Now, let's put them into array form.
# Each entry is [ first byte, last byte, class ]; together the ranges
# cover every byte value 0x00-0xff exactly once.

my @utf8_cls = (
    [ 0x00 , 0x00 ,  1 ],
    [ 0x0e , 0x0f ,  0 ],
    [ 0x1b , 0x1b ,  0 ],
    [ 0x01 , 0x0d ,  1 ],
    [ 0x10 , 0x1a ,  1 ],
    [ 0x1c , 0x7f ,  1 ],
    [ 0x80 , 0x8f ,  2 ],
    [ 0x90 , 0x9f ,  3 ],
    [ 0xa0 , 0xbf ,  4 ],
    [ 0xc0 , 0xc1 ,  0 ],
    [ 0xc2 , 0xdf ,  5 ],
    [ 0xe0 , 0xe0 ,  6 ],
    [ 0xe1 , 0xec ,  7 ],
    [ 0xed , 0xed ,  8 ],
    [ 0xee , 0xef ,  7 ],
    [ 0xf0 , 0xf0 ,  9 ],
    [ 0xf1 , 0xf3 , 10 ],
    [ 0xf4 , 0xf4 , 11 ],
    [ 0xf5 , 0xff ,  0 ],
);

#
# Now, we write the state diagram in terms of classes
#
# 0:k0->Error
# 0:k1->0
# 0:k5->3
# 0:k6->4
# 0:k7->5
# 0:k8->6
# 0:k9->7
# 0:k10->8
# 0:k11->9
# 0:*->Error
# 3:k2,k3,k4->0
# 3:*->Error
# 4:k4->3
# 4:*->Error
# 5:k2,k3,k4->3
# 5:*->Error
# 6:k2,k3->3
# 6:*->Error
# 7:k3,k4->5
# 7:*->Error
# 8:k2,k3,k4->5
# 8:*->Error
# 9:k2->5
# 9:*->Error
#
# Now, let's put them into an array: one row per state, one column per
# class (k0..k11); each cell is the next state.
#
# Switches the current package for the rest of the file. The lexical
# tables are unaffected by this, and the call below is fully qualified.
package genverifier;

my @utf8_st = (
#   0  1  2  3  4  5  6  7  8  9 10 11
    1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9,   # state 0 Start
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   # state 1 Error
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   # state 2 ItsMe
    1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,   # state 3
    1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1,   # state 4
    1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1,   # state 5
    1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,   # state 6
    # After lead byte f0 the second byte must be 90-bf (k3,k4); 80-8f (k2)
    # would be an overlong encoding of a code point below U+10000.
    1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1,   # state 7
    1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1,   # state 8
    1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1,   # state 9
);

# 12 equals the number of classes (columns) in @utf8_st. The exact meaning
# of each GenVerifier argument is defined in genverifier.pm, which is not
# visible from this file -- confirm there before changing the call.
my $utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 12, \@utf8_st);
print $utf8_ver;