intl/chardet/tools/genutf8.pl

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 #!/usr/local/bin/perl
michael@0 2 # This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 # License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0 5
michael@0 6 use strict;
michael@0 7 require "genverifier.pm";
michael@0 8 use genverifier;
michael@0 9
michael@0 10
michael@0 11 my(@utf8_cls); # byte-range to class table; each entry is [ first byte, last byte, class ]
michael@0 12 my(@utf8_st); # state transition table, written as one row of 12 entries per state
michael@0 13 my($utf8_ver); # holds the value returned by genverifier::GenVerifier, printed at the end
michael@0 14
michael@0 15 #
michael@0 16 #
michael@0 17 # UTF8 encode the UCS4 into 1 to 4 bytes
michael@0 18 #
michael@0 19 # 1 byte 00 00 00 00 00 00 00 7f
michael@0 20 # 2 bytes 00 00 00 80 00 00 07 ff
michael@0 21 # 3 bytes 00 00 08 00 00 00 ff ff
michael@0 22 # 4 bytes 00 01 00 00 00 10 ff ff
michael@0 23 #
michael@0 24 # However, since code points in the surrogate area (d800-dfff) must not
michael@0 25 # be encoded in UTF8, we can remove the surrogate area from the 3 bytes range
michael@0 26 #
michael@0 27 # 1 byte 00 00 00 00 00 00 00 7f
michael@0 28 # 2 bytes 00 00 00 80 00 00 07 ff
michael@0 29 # 3 bytes 00 00 08 00 00 00 d7 ff
michael@0 30 # 00 00 e0 00 00 00 ff ff
michael@0 31 # 4 bytes 00 01 00 00 00 10 ff ff
michael@0 32 #
michael@0 33 # Now we break them into 6-bit groups for 2-4 byte UTF8
michael@0 34 #
michael@0 35 # 1 byte 00 7f
michael@0 36 # 2 bytes 02 00 1f 3f
michael@0 37 # 3 bytes 00 20 00 0d 1f 3f
michael@0 38 # 0e 00 00 0f 3f 3f
michael@0 39 # 4 bytes 00 10 00 00 04 0f 3f 3f
michael@0 40 #
michael@0 41 # Break down more
michael@0 42 #
michael@0 43 # 1 byte 00 7f
michael@0 44 # 2 bytes 02 00 1f 3f
michael@0 45 # 3 bytes 00 20 00 00 3f 3f
michael@0 46 # 01 00 00 0c 3f 3f
michael@0 47 # 0d 00 00 0d 1f 3f
michael@0 48 # 0e 00 00 0f 3f 3f
michael@0 49 # 4 bytes 00 10 00 00 00 3f 3f 3f
michael@0 50 # 01 00 00 00 03 3f 3f 3f
michael@0 51 # 04 00 00 00 04 0f 3f 3f
michael@0 52 #
michael@0 53 # Now, add
michael@0 54 # c0 to the lead byte of 2 bytes UTF8
michael@0 55 # e0 to the lead byte of 3 bytes UTF8
michael@0 56 # f0 to the lead byte of 4 bytes UTF8
michael@0 57 # 80 to the trail bytes
michael@0 58 #
michael@0 59 # 1 byte 00 7f
michael@0 60 # 2 bytes c2 80 df bf
michael@0 61 # 3 bytes e0 a0 80 e0 bf bf
michael@0 62 # e1 80 80 ec bf bf
michael@0 63 # ed 80 80 ed 9f bf
michael@0 64 # ee 80 80 ef bf bf
michael@0 65 # 4 bytes f0 90 80 80 f0 bf bf bf
michael@0 66 # f1 80 80 80 f3 bf bf bf
michael@0 67 # f4 80 80 80 f4 8f bf bf
michael@0 68 #
michael@0 69 #
michael@0 70 # Now we can construct our state diagram
michael@0 71 #
michael@0 72 # 0:0x0e,0x0f,0x1b->Error
michael@0 73 # 0:[0-0x7f]->0
michael@0 74 # 0:[c2-df]->3
michael@0 75 # 0:e0->4
michael@0 76 # 0:[e1-ec, ee-ef]->5
michael@0 77 # 0:ed->6
michael@0 78 # 0:f0->7
michael@0 79 # 0:[f1-f3]->8
michael@0 80 # 0:f4->9
michael@0 81 # 0:*->Error
michael@0 82 # 3:[80-bf]->0
michael@0 83 # 3:*->Error
michael@0 84 # 4:[a0-bf]->3
michael@0 85 # 4:*->Error
michael@0 86 # 5:[80-bf]->3
michael@0 87 # 5:*->Error
michael@0 88 # 6:[80-9f]->3
michael@0 89 # 6:*->Error
michael@0 90 # 7:[90-bf]->5
michael@0 91 # 7:*->Error
michael@0 92 # 8:[80-bf]->5
michael@0 93 # 8:*->Error
michael@0 94 # 9:[80-8f]->5
michael@0 95 # 9:*->Error
michael@0 96 #
michael@0 97 # Now we classify the bytes into classes
michael@0 98 #
michael@0 99 # 0e,0f,1b:k0
michael@0 100 # 00-0d,10-1a,1c-7f:k1
michael@0 101 # 80-8f:k2
michael@0 102 # 90-9f:k3
michael@0 103 # a0-bf:k4
michael@0 104 # c0-c1:k0
michael@0 105 # c2-df:k5
michael@0 106 # e0:k6
michael@0 107 # e1-ec:k7
michael@0 108 # ed:k8
michael@0 109 # ee-ef:k7
michael@0 110 # f0:k9
michael@0 111 # f1-f3:k10
michael@0 112 # f4:k11
michael@0 113 # f5-ff:k0
michael@0 114 #
michael@0 115 # Now, let's put them into array form
michael@0 116
michael@0 117 @utf8_cls = ( # each entry: [ first byte, last byte, class number ]
michael@0 118 [ 0x00 , 0x00 , 1 ], # NUL is placed in class 1 here, together with the accepted ASCII bytes
michael@0 119 [ 0x0e , 0x0f , 0 ], # class 0: the start state sends it to Error
michael@0 120 [ 0x1b , 0x1b , 0 ], # class 0: the start state sends it to Error
michael@0 121 [ 0x01 , 0x0d , 1 ], # class 1: single byte, stays in the start state
michael@0 122 [ 0x10 , 0x1a , 1 ],
michael@0 123 [ 0x1c , 0x7f , 1 ],
michael@0 124 [ 0x80 , 0x8f , 2 ], # class 2: trail byte, low range
michael@0 125 [ 0x90 , 0x9f , 3 ], # class 3: trail byte, middle range
michael@0 126 [ 0xa0 , 0xbf , 4 ], # class 4: trail byte, high range
michael@0 127 [ 0xc0 , 0xc1 , 0 ], # class 0: not a lead byte of any range listed above
michael@0 128 [ 0xc2 , 0xdf , 5 ], # class 5: lead byte of 2 bytes UTF8
michael@0 129 [ 0xe0 , 0xe0 , 6 ], # class 6: lead byte e0 (next byte must be a0-bf)
michael@0 130 [ 0xe1 , 0xec , 7 ], # class 7: lead byte of 3 bytes UTF8, any trail byte
michael@0 131 [ 0xed , 0xed , 8 ], # class 8: lead byte ed (next byte must be 80-9f)
michael@0 132 [ 0xee , 0xef , 7 ], # class 7 again: same handling as e1-ec
michael@0 133 [ 0xf0 , 0xf0 , 9 ], # class 9: lead byte f0 (next byte must be 90-bf)
michael@0 134 [ 0xf1 , 0xf3 , 10 ], # class 10: lead byte of 4 bytes UTF8, any trail byte
michael@0 135 [ 0xf4 , 0xf4 , 11 ], # class 11: lead byte f4 (next byte must be 80-8f)
michael@0 136 [ 0xf5 , 0xff , 0 ], # class 0: beyond the 4 bytes range handled here
michael@0 137 );
michael@0 138 #
michael@0 139 # Now we rewrite the state diagram in terms of classes
michael@0 140 #
michael@0 141 # 0:k0->Error
michael@0 142 # 0:k1->0
michael@0 143 # 0:k5->3
michael@0 144 # 0:k6->4
michael@0 145 # 0:k7->5
michael@0 146 # 0:k8->6
michael@0 147 # 0:k9->7
michael@0 148 # 0:k10->8
michael@0 149 # 0:k11->9
michael@0 150 # 0:*->Error
michael@0 151 # 3:k2,k3,k4->0
michael@0 152 # 3:*->Error
michael@0 153 # 4:k4->3
michael@0 154 # 4:*->Error
michael@0 155 # 5:k2,k3,k4->3
michael@0 156 # 5:*->Error
michael@0 157 # 6:k2,k3->3
michael@0 158 # 6:*->Error
michael@0 159 # 7:k3,k4->5
michael@0 160 # 7:*->Error
michael@0 161 # 8:k2,k3,k4->5
michael@0 162 # 8:*->Error
michael@0 163 # 9:k2->5
michael@0 164 # 9:*->Error
michael@0 165 #
michael@0 166 # Now, let's put them into array
michael@0 167 #
michael@0 168 package genverifier; # switches the current package only; @utf8_st below is still the lexical declared with my() earlier
michael@0 169 @utf8_st = ( # transition table: one row per current state, one column per class, value = next state
michael@0 170 # 0 1 2 3 4 5 6 7 8 9 10 11
michael@0 171 1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9, # state 0 Start
michael@0 172 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error
michael@0 173 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe
michael@0 174 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, # state 3: last trail byte, 80-bf (k2,k3,k4) -> Start
michael@0 175 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, # state 4: after e0, only a0-bf (k4) -> 3
michael@0 176 1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, # state 5: two trail bytes left, 80-bf (k2,k3,k4) -> 3
michael@0 177 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, # state 6: after ed, only 80-9f (k2,k3) -> 3
michael@0 178 1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 7: after f0, only 90-bf (k3,k4) -> 5; 80-8f (k2) is an error
michael@0 179 1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 8: after f1-f3, 80-bf (k2,k3,k4) -> 5
michael@0 180 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9: after f4, only 80-8f (k2) -> 5
michael@0 181 );
michael@0 182
michael@0 183
michael@0 184
michael@0 185 $utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 12, \@utf8_st); # 12 matches the number of classes (k0..k11), i.e. entries per state row; how GenVerifier uses its arguments is defined in genverifier.pm, not shown here
michael@0 186 print $utf8_ver; # writes the returned value to the currently selected output handle (STDOUT by default)
michael@0 187
michael@0 188
michael@0 189

mercurial