Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | #include "nsCodingStateMachine.h" |
michael@0 | 6 | |
michael@0 | 7 | /* |
michael@0 | 8 | Modification from Frank Tang's original work: |
michael@0 | 9 | . 0x00 is allowed as a legal character, since some web pages contain this char in |
michael@0 | 10 | the text stream. |
michael@0 | 11 | */ |
michael@0 | 12 | |
michael@0 | 13 | // BIG5 |
michael@0 | 14 | /* BIG5_cls: class of every byte value for the Big5 model (classes 0-4 appear). Each PCK4BITS row covers the eight byte values named in its trailing comment. NOTE(review): PCK4BITS is defined outside this file; presumably it packs eight 4-bit values into one uint32_t - confirm. */ |
michael@0 | 15 | static const uint32_t BIG5_cls [ 256 / 8 ] = { |
michael@0 | 16 | // Superseded row (0x00 was class 0): PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 |
michael@0 | 17 | PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 (0x00 is class 1 so that it is allowed as a legal value) |
michael@0 | 18 | PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f (0x0e and 0x0f are class 0) |
michael@0 | 19 | PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 |
michael@0 | 20 | PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f (0x1b is class 0) |
michael@0 | 21 | PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 |
michael@0 | 22 | PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f |
michael@0 | 23 | PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 |
michael@0 | 24 | PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f |
michael@0 | 25 | PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 |
michael@0 | 26 | PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f |
michael@0 | 27 | PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 |
michael@0 | 28 | PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f |
michael@0 | 29 | PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 |
michael@0 | 30 | PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f |
michael@0 | 31 | PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 |
michael@0 | 32 | PCK4BITS(2,2,2,2,2,2,2,1), // 78 - 7f |
michael@0 | 33 | PCK4BITS(4,4,4,4,4,4,4,4), // 80 - 87 |
michael@0 | 34 | PCK4BITS(4,4,4,4,4,4,4,4), // 88 - 8f |
michael@0 | 35 | PCK4BITS(4,4,4,4,4,4,4,4), // 90 - 97 |
michael@0 | 36 | PCK4BITS(4,4,4,4,4,4,4,4), // 98 - 9f |
michael@0 | 37 | PCK4BITS(4,3,3,3,3,3,3,3), // a0 - a7 |
michael@0 | 38 | PCK4BITS(3,3,3,3,3,3,3,3), // a8 - af |
michael@0 | 39 | PCK4BITS(3,3,3,3,3,3,3,3), // b0 - b7 |
michael@0 | 40 | PCK4BITS(3,3,3,3,3,3,3,3), // b8 - bf |
michael@0 | 41 | PCK4BITS(3,3,3,3,3,3,3,3), // c0 - c7 |
michael@0 | 42 | PCK4BITS(3,3,3,3,3,3,3,3), // c8 - cf |
michael@0 | 43 | PCK4BITS(3,3,3,3,3,3,3,3), // d0 - d7 |
michael@0 | 44 | PCK4BITS(3,3,3,3,3,3,3,3), // d8 - df |
michael@0 | 45 | PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7 |
michael@0 | 46 | PCK4BITS(3,3,3,3,3,3,3,3), // e8 - ef |
michael@0 | 47 | PCK4BITS(3,3,3,3,3,3,3,3), // f0 - f7 |
michael@0 | 48 | PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff (0xff is class 0) |
michael@0 | 49 | }; |
michael@0 | 50 | |
michael@0 | 51 | /* BIG5_st: packed state-transition entries for the Big5 model (3 words x 8 entries); the trailing comments give each row's entry-index range in hex. NOTE(review): the lookup is presumably indexed by current state and byte class in nsCodingStateMachine.h - confirm there. */ |
michael@0 | 52 | static const uint32_t BIG5_st [ 3] = { |
michael@0 | 53 | PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 |
michael@0 | 54 | PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f |
michael@0 | 55 | PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17 |
michael@0 | 56 | }; |
michael@0 | 57 | /* Big5CharLenTable: 5 entries, the same number as the class count given in Big5SMModel. NOTE(review): presumably indexed by byte class and holding the character length in bytes - confirm. */ |
michael@0 | 58 | static const uint32_t Big5CharLenTable[] = {0, 1, 1, 2, 0}; |
michael@0 | 59 | /* Big5SMModel: bundles the Big5 tables defined in this file. Declared as `const SMModel` like the other models in this file (same type as `SMModel const`). NOTE(review): SMModel and the e*4bits constants are declared outside this file, so field meanings are inferred from the initializer only. */ |
michael@0 | 60 | const SMModel Big5SMModel = { |
michael@0 | 61 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls }, /* byte-class table */ |
michael@0 | 62 | 5, /* same as the number of entries in Big5CharLenTable */ |
michael@0 | 63 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st }, /* state table */ |
michael@0 | 64 | CHAR_LEN_TABLE(Big5CharLenTable), |
michael@0 | 65 | "Big5", |
michael@0 | 66 | }; |
michael@0 | 67 | /* EUCJP_cls: class of every byte value for the EUC-JP model (classes 0-5 appear). Each PCK4BITS row covers the eight byte values named in its trailing comment. NOTE(review): PCK4BITS is defined outside this file; presumably it packs eight 4-bit values into one uint32_t - confirm. */ |
michael@0 | 68 | static const uint32_t EUCJP_cls [ 256 / 8 ] = { |
michael@0 | 69 | // Superseded row (0x00 was class 5): PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07 |
michael@0 | 70 | PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07 (0x00 is class 4, like the other bytes in this row) |
michael@0 | 71 | PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f (0x0e and 0x0f are class 5) |
michael@0 | 72 | PCK4BITS(4,4,4,4,4,4,4,4), // 10 - 17 |
michael@0 | 73 | PCK4BITS(4,4,4,5,4,4,4,4), // 18 - 1f (0x1b is class 5) |
michael@0 | 74 | PCK4BITS(4,4,4,4,4,4,4,4), // 20 - 27 |
michael@0 | 75 | PCK4BITS(4,4,4,4,4,4,4,4), // 28 - 2f |
michael@0 | 76 | PCK4BITS(4,4,4,4,4,4,4,4), // 30 - 37 |
michael@0 | 77 | PCK4BITS(4,4,4,4,4,4,4,4), // 38 - 3f |
michael@0 | 78 | PCK4BITS(4,4,4,4,4,4,4,4), // 40 - 47 |
michael@0 | 79 | PCK4BITS(4,4,4,4,4,4,4,4), // 48 - 4f |
michael@0 | 80 | PCK4BITS(4,4,4,4,4,4,4,4), // 50 - 57 |
michael@0 | 81 | PCK4BITS(4,4,4,4,4,4,4,4), // 58 - 5f |
michael@0 | 82 | PCK4BITS(4,4,4,4,4,4,4,4), // 60 - 67 |
michael@0 | 83 | PCK4BITS(4,4,4,4,4,4,4,4), // 68 - 6f |
michael@0 | 84 | PCK4BITS(4,4,4,4,4,4,4,4), // 70 - 77 |
michael@0 | 85 | PCK4BITS(4,4,4,4,4,4,4,4), // 78 - 7f |
michael@0 | 86 | PCK4BITS(5,5,5,5,5,5,5,5), // 80 - 87 |
michael@0 | 87 | PCK4BITS(5,5,5,5,5,5,1,3), // 88 - 8f (0x8e is class 1, 0x8f is class 3) |
michael@0 | 88 | PCK4BITS(5,5,5,5,5,5,5,5), // 90 - 97 |
michael@0 | 89 | PCK4BITS(5,5,5,5,5,5,5,5), // 98 - 9f |
michael@0 | 90 | PCK4BITS(5,2,2,2,2,2,2,2), // a0 - a7 |
michael@0 | 91 | PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af |
michael@0 | 92 | PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 |
michael@0 | 93 | PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf |
michael@0 | 94 | PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 |
michael@0 | 95 | PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf |
michael@0 | 96 | PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 |
michael@0 | 97 | PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df |
michael@0 | 98 | PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7 |
michael@0 | 99 | PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef |
michael@0 | 100 | PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7 |
michael@0 | 101 | PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff (0xff is class 5) |
michael@0 | 102 | }; |
michael@0 | 103 | |
michael@0 | 104 | /* EUCJP_st: packed state-transition entries for the EUC-JP model (5 words x 8 entries); the trailing comments give each row's entry-index range in hex. NOTE(review): the lookup is presumably indexed by current state and byte class in nsCodingStateMachine.h - confirm there. */ |
michael@0 | 105 | static const uint32_t EUCJP_st [ 5] = { |
michael@0 | 106 | PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07 |
michael@0 | 107 | PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f |
michael@0 | 108 | PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17 |
michael@0 | 109 | PCK4BITS(eError,eError,eStart,eError,eError,eError, 3,eError),//18-1f |
michael@0 | 110 | PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27 |
michael@0 | 111 | }; |
michael@0 | 112 | /* EUCJPCharLenTable: 6 entries, the same number as the class count given in EUCJPSMModel. NOTE(review): presumably indexed by byte class and holding the character length in bytes - confirm. */ |
michael@0 | 113 | static const uint32_t EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0}; |
michael@0 | 114 | /* EUCJPSMModel: bundles the EUC-JP tables defined in this file. NOTE(review): SMModel and the e*4bits constants are declared outside this file, so field meanings are inferred from the initializer only. */ |
michael@0 | 115 | const SMModel EUCJPSMModel = { |
michael@0 | 116 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls }, /* byte-class table */ |
michael@0 | 117 | 6, /* same as the number of entries in EUCJPCharLenTable */ |
michael@0 | 118 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st }, /* state table */ |
michael@0 | 119 | CHAR_LEN_TABLE(EUCJPCharLenTable), |
michael@0 | 120 | "EUC-JP", |
michael@0 | 121 | }; |
michael@0 | 122 | /* EUCKR_cls: class of every byte value for the EUC-KR model (classes 0-3 appear). Each PCK4BITS row covers the eight byte values named in its trailing comment. NOTE(review): PCK4BITS is defined outside this file; presumably it packs eight 4-bit values into one uint32_t - confirm. */ |
michael@0 | 123 | static const uint32_t EUCKR_cls [ 256 / 8 ] = { |
michael@0 | 124 | // Superseded row (0x00 was class 0): PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 |
michael@0 | 125 | PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 (0x00 is class 1, like the other bytes in this row) |
michael@0 | 126 | PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f (0x0e and 0x0f are class 0) |
michael@0 | 127 | PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 |
michael@0 | 128 | PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f (0x1b is class 0) |
michael@0 | 129 | PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 |
michael@0 | 130 | PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f |
michael@0 | 131 | PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 |
michael@0 | 132 | PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f |
michael@0 | 133 | PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 |
michael@0 | 134 | PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f |
michael@0 | 135 | PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 |
michael@0 | 136 | PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f |
michael@0 | 137 | PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 |
michael@0 | 138 | PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f |
michael@0 | 139 | PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 |
michael@0 | 140 | PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f |
michael@0 | 141 | PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 |
michael@0 | 142 | PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f |
michael@0 | 143 | PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 |
michael@0 | 144 | PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f |
michael@0 | 145 | PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7 |
michael@0 | 146 | PCK4BITS(2,2,2,2,2,3,3,3), // a8 - af (0xad - 0xaf are class 3) |
michael@0 | 147 | PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 |
michael@0 | 148 | PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf |
michael@0 | 149 | PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 |
michael@0 | 150 | PCK4BITS(2,3,2,2,2,2,2,2), // c8 - cf (0xc9 is class 3) |
michael@0 | 151 | PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 |
michael@0 | 152 | PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df |
michael@0 | 153 | PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 |
michael@0 | 154 | PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef |
michael@0 | 155 | PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 |
michael@0 | 156 | PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff (0xff is class 0) |
michael@0 | 157 | }; |
michael@0 | 158 | |
michael@0 | 159 | /* EUCKR_st: packed state-transition entries for the EUC-KR model (2 words x 8 entries); the trailing comments give each row's entry-index range in hex. NOTE(review): the lookup is presumably indexed by current state and byte class in nsCodingStateMachine.h - confirm there. */ |
michael@0 | 160 | static const uint32_t EUCKR_st [ 2] = { |
michael@0 | 161 | PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 |
michael@0 | 162 | PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f |
michael@0 | 163 | }; |
michael@0 | 164 | /* EUCKRCharLenTable: 4 entries, the same number as the class count given in EUCKRSMModel. NOTE(review): presumably indexed by byte class and holding the character length in bytes - confirm. */ |
michael@0 | 165 | static const uint32_t EUCKRCharLenTable[] = {0, 1, 2, 0}; |
michael@0 | 166 | /* EUCKRSMModel: bundles the EUC-KR tables defined in this file. NOTE(review): SMModel and the e*4bits constants are declared outside this file, so field meanings are inferred from the initializer only. */ |
michael@0 | 167 | const SMModel EUCKRSMModel = { |
michael@0 | 168 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls }, /* byte-class table */ |
michael@0 | 169 | 4, /* same as the number of entries in EUCKRCharLenTable */ |
michael@0 | 170 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st }, /* state table */ |
michael@0 | 171 | CHAR_LEN_TABLE(EUCKRCharLenTable), |
michael@0 | 172 | "EUC-KR", |
michael@0 | 173 | }; |
michael@0 | 174 | /* EUCTW_cls: class of every byte value for the EUC-TW model (classes 0-6 appear). Each PCK4BITS row covers the eight byte values named in its trailing comment. NOTE(review): PCK4BITS is defined outside this file; presumably it packs eight 4-bit values into one uint32_t - confirm. */ |
michael@0 | 175 | static const uint32_t EUCTW_cls [ 256 / 8 ] = { |
michael@0 | 176 | // Superseded row (0x00 was class 0): PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07 |
michael@0 | 177 | PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07 (0x00 is class 2, like the other bytes in this row) |
michael@0 | 178 | PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f (0x0e and 0x0f are class 0) |
michael@0 | 179 | PCK4BITS(2,2,2,2,2,2,2,2), // 10 - 17 |
michael@0 | 180 | PCK4BITS(2,2,2,0,2,2,2,2), // 18 - 1f (0x1b is class 0) |
michael@0 | 181 | PCK4BITS(2,2,2,2,2,2,2,2), // 20 - 27 |
michael@0 | 182 | PCK4BITS(2,2,2,2,2,2,2,2), // 28 - 2f |
michael@0 | 183 | PCK4BITS(2,2,2,2,2,2,2,2), // 30 - 37 |
michael@0 | 184 | PCK4BITS(2,2,2,2,2,2,2,2), // 38 - 3f |
michael@0 | 185 | PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 |
michael@0 | 186 | PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f |
michael@0 | 187 | PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 |
michael@0 | 188 | PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f |
michael@0 | 189 | PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 |
michael@0 | 190 | PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f |
michael@0 | 191 | PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 |
michael@0 | 192 | PCK4BITS(2,2,2,2,2,2,2,2), // 78 - 7f |
michael@0 | 193 | PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 |
michael@0 | 194 | PCK4BITS(0,0,0,0,0,0,6,0), // 88 - 8f (0x8e is class 6) |
michael@0 | 195 | PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 |
michael@0 | 196 | PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f |
michael@0 | 197 | PCK4BITS(0,3,4,4,4,4,4,4), // a0 - a7 |
michael@0 | 198 | PCK4BITS(5,5,1,1,1,1,1,1), // a8 - af |
michael@0 | 199 | PCK4BITS(1,1,1,1,1,1,1,1), // b0 - b7 |
michael@0 | 200 | PCK4BITS(1,1,1,1,1,1,1,1), // b8 - bf |
michael@0 | 201 | PCK4BITS(1,1,3,1,3,3,3,3), // c0 - c7 |
michael@0 | 202 | PCK4BITS(3,3,3,3,3,3,3,3), // c8 - cf |
michael@0 | 203 | PCK4BITS(3,3,3,3,3,3,3,3), // d0 - d7 |
michael@0 | 204 | PCK4BITS(3,3,3,3,3,3,3,3), // d8 - df |
michael@0 | 205 | PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7 |
michael@0 | 206 | PCK4BITS(3,3,3,3,3,3,3,3), // e8 - ef |
michael@0 | 207 | PCK4BITS(3,3,3,3,3,3,3,3), // f0 - f7 |
michael@0 | 208 | PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff (0xff is class 0) |
michael@0 | 209 | }; |
michael@0 | 210 | |
michael@0 | 211 | /* EUCTW_st: packed state-transition entries for the EUC-TW model (6 words x 8 entries); the trailing comments give each row's entry-index range in hex. NOTE(review): the lookup is presumably indexed by current state and byte class in nsCodingStateMachine.h - confirm there. */ |
michael@0 | 212 | static const uint32_t EUCTW_st [ 6] = { |
michael@0 | 213 | PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07 |
michael@0 | 214 | PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f |
michael@0 | 215 | PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17 |
michael@0 | 216 | PCK4BITS(eStart,eStart,eStart,eError,eError,eError,eError,eError),//18-1f |
michael@0 | 217 | PCK4BITS( 5,eError,eError,eError,eStart,eError,eStart,eStart),//20-27 |
michael@0 | 218 | PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f |
michael@0 | 219 | }; |
michael@0 | 220 | /* EUCTWCharLenTable: 7 entries, the same number as the class count given in EUCTWSMModel. NOTE(review): presumably indexed by byte class and holding the character length in bytes - confirm. */ |
michael@0 | 221 | static const uint32_t EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3}; |
michael@0 | 222 | /* EUCTWSMModel: bundles the EUC-TW tables defined in this file. NOTE(review): SMModel and the e*4bits constants are declared outside this file, so field meanings are inferred from the initializer only. */ |
michael@0 | 223 | const SMModel EUCTWSMModel = { |
michael@0 | 224 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls }, /* byte-class table */ |
michael@0 | 225 | 7, /* same as the number of entries in EUCTWCharLenTable */ |
michael@0 | 226 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st }, /* state table */ |
michael@0 | 227 | CHAR_LEN_TABLE(EUCTWCharLenTable), |
michael@0 | 228 | "x-euc-tw", |
michael@0 | 229 | }; |
michael@0 | 230 | |
michael@0 | 231 | /* The GB2312 model below is obsolete (superseded by GB18030) and is kept commented out. |
michael@0 | 232 | static uint32_t GB2312_cls [ 256 / 8 ] = { |
michael@0 | 233 | //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 |
michael@0 | 234 | PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 |
michael@0 | 235 | PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f |
michael@0 | 236 | PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 |
michael@0 | 237 | PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f |
michael@0 | 238 | PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 |
michael@0 | 239 | PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f |
michael@0 | 240 | PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 |
michael@0 | 241 | PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f |
michael@0 | 242 | PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 |
michael@0 | 243 | PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f |
michael@0 | 244 | PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 |
michael@0 | 245 | PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f |
michael@0 | 246 | PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 |
michael@0 | 247 | PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f |
michael@0 | 248 | PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 |
michael@0 | 249 | PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f |
michael@0 | 250 | PCK4BITS(1,0,0,0,0,0,0,0), // 80 - 87 |
michael@0 | 251 | PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f |
michael@0 | 252 | PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 |
michael@0 | 253 | PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f |
michael@0 | 254 | PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7 |
michael@0 | 255 | PCK4BITS(2,2,3,3,3,3,3,3), // a8 - af |
michael@0 | 256 | PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 |
michael@0 | 257 | PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf |
michael@0 | 258 | PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 |
michael@0 | 259 | PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf |
michael@0 | 260 | PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 |
michael@0 | 261 | PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df |
michael@0 | 262 | PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 |
michael@0 | 263 | PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef |
michael@0 | 264 | PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 |
michael@0 | 265 | PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff |
michael@0 | 266 | }; |
michael@0 | 267 | |
michael@0 | 268 | |
michael@0 | 269 | static uint32_t GB2312_st [ 2] = { |
michael@0 | 270 | PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 |
michael@0 | 271 | PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f |
michael@0 | 272 | }; |
michael@0 | 273 | |
michael@0 | 274 | static const uint32_t GB2312CharLenTable[] = {0, 1, 2, 0}; |
michael@0 | 275 | |
michael@0 | 276 | SMModel GB2312SMModel = { |
michael@0 | 277 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_cls }, |
michael@0 | 278 | 4, |
michael@0 | 279 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_st }, |
michael@0 | 280 | CHAR_LEN_TABLE(GB2312CharLenTable), |
michael@0 | 281 | "GB2312", |
michael@0 | 282 | }; |
michael@0 | 283 | */ |
michael@0 | 284 | |
michael@0 | 285 | // GB18030_cls: class of every byte value for the GB18030 model (classes 0-6 appear). The following state machine data was created by a perl script in |
michael@0 | 286 | // intl/chardet/tools. It should be the same as in the PSM detector. |
michael@0 | 287 | static const uint32_t GB18030_cls [ 256 / 8 ] = { |
michael@0 | 288 | PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 |
michael@0 | 289 | PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f (0x0e and 0x0f are class 0) |
michael@0 | 290 | PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 |
michael@0 | 291 | PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f (0x1b is class 0) |
michael@0 | 292 | PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 |
michael@0 | 293 | PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f |
michael@0 | 294 | PCK4BITS(3,3,3,3,3,3,3,3), // 30 - 37 |
michael@0 | 295 | PCK4BITS(3,3,1,1,1,1,1,1), // 38 - 3f (0x30 - 0x39 are class 3) |
michael@0 | 296 | PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 |
michael@0 | 297 | PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f |
michael@0 | 298 | PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 |
michael@0 | 299 | PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f |
michael@0 | 300 | PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 |
michael@0 | 301 | PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f |
michael@0 | 302 | PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 |
michael@0 | 303 | PCK4BITS(2,2,2,2,2,2,2,4), // 78 - 7f (0x7f is class 4) |
michael@0 | 304 | PCK4BITS(5,6,6,6,6,6,6,6), // 80 - 87 (0x80 is class 5) |
michael@0 | 305 | PCK4BITS(6,6,6,6,6,6,6,6), // 88 - 8f |
michael@0 | 306 | PCK4BITS(6,6,6,6,6,6,6,6), // 90 - 97 |
michael@0 | 307 | PCK4BITS(6,6,6,6,6,6,6,6), // 98 - 9f |
michael@0 | 308 | PCK4BITS(6,6,6,6,6,6,6,6), // a0 - a7 |
michael@0 | 309 | PCK4BITS(6,6,6,6,6,6,6,6), // a8 - af |
michael@0 | 310 | PCK4BITS(6,6,6,6,6,6,6,6), // b0 - b7 |
michael@0 | 311 | PCK4BITS(6,6,6,6,6,6,6,6), // b8 - bf |
michael@0 | 312 | PCK4BITS(6,6,6,6,6,6,6,6), // c0 - c7 |
michael@0 | 313 | PCK4BITS(6,6,6,6,6,6,6,6), // c8 - cf |
michael@0 | 314 | PCK4BITS(6,6,6,6,6,6,6,6), // d0 - d7 |
michael@0 | 315 | PCK4BITS(6,6,6,6,6,6,6,6), // d8 - df |
michael@0 | 316 | PCK4BITS(6,6,6,6,6,6,6,6), // e0 - e7 |
michael@0 | 317 | PCK4BITS(6,6,6,6,6,6,6,6), // e8 - ef |
michael@0 | 318 | PCK4BITS(6,6,6,6,6,6,6,6), // f0 - f7 |
michael@0 | 319 | PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff (0xff is class 0) |
michael@0 | 320 | }; |
michael@0 | 321 | |
michael@0 | 322 | /* GB18030_st: packed state-transition entries for the GB18030 model (6 words x 8 entries); the trailing comments give each row's entry-index range in hex. NOTE(review): the lookup is presumably indexed by current state and byte class in nsCodingStateMachine.h - confirm there. */ |
michael@0 | 323 | static const uint32_t GB18030_st [ 6] = { |
michael@0 | 324 | PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07 |
michael@0 | 325 | PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f |
michael@0 | 326 | PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17 |
michael@0 | 327 | PCK4BITS( 4,eError,eStart,eStart,eError,eError,eError,eError),//18-1f |
michael@0 | 328 | PCK4BITS(eError,eError, 5,eError,eError,eError,eItsMe,eError),//20-27 |
michael@0 | 329 | PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f |
michael@0 | 330 | }; |
michael@0 | 331 | |
michael@0 | 332 | // GB18030CharLenTable: character length per byte class (7 entries). To be accurate, the length of class 6 can be either 2 or 4. |
michael@0 | 333 | // But it is not necessary to discriminate between the two since |
michael@0 | 334 | // it is used for frequency analysis only, and we are validating |
michael@0 | 335 | // each code range there as well. So it is safe to set it to be |
michael@0 | 336 | // 2 here. |
michael@0 | 337 | static const uint32_t GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2}; |
michael@0 | 338 | /* GB18030SMModel: bundles the GB18030 tables defined in this file. NOTE(review): SMModel and the e*4bits constants are declared outside this file, so field meanings are inferred from the initializer only. */ |
michael@0 | 339 | const SMModel GB18030SMModel = { |
michael@0 | 340 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls }, /* byte-class table */ |
michael@0 | 341 | 7, /* same as the number of entries in GB18030CharLenTable */ |
michael@0 | 342 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st }, /* state table */ |
michael@0 | 343 | CHAR_LEN_TABLE(GB18030CharLenTable), |
michael@0 | 344 | "GB18030", |
michael@0 | 345 | }; |
michael@0 | 346 | |
michael@0 | 347 | // sjis |
michael@0 | 348 | /* SJIS_cls: class of every byte value for the Shift_JIS model. Only classes 0-4 appear in this table although SJISSMModel gives a class count of 6 - NOTE(review): confirm that is intended. Each PCK4BITS row covers the eight byte values named in its trailing comment; PCK4BITS is defined outside this file. */ |
michael@0 | 349 | static const uint32_t SJIS_cls [ 256 / 8 ] = { |
michael@0 | 350 | // Superseded row (0x00 was class 0): PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 |
michael@0 | 351 | PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 (0x00 is class 1, like the other bytes in this row) |
michael@0 | 352 | PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f (0x0e and 0x0f are class 0) |
michael@0 | 353 | PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 |
michael@0 | 354 | PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f (0x1b is class 0) |
michael@0 | 355 | PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 |
michael@0 | 356 | PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f |
michael@0 | 357 | PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 |
michael@0 | 358 | PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f |
michael@0 | 359 | PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 |
michael@0 | 360 | PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f |
michael@0 | 361 | PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 |
michael@0 | 362 | PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f |
michael@0 | 363 | PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 |
michael@0 | 364 | PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f |
michael@0 | 365 | PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 |
michael@0 | 366 | PCK4BITS(2,2,2,2,2,2,2,1), // 78 - 7f |
michael@0 | 367 | PCK4BITS(3,3,3,3,3,3,3,3), // 80 - 87 |
michael@0 | 368 | PCK4BITS(3,3,3,3,3,3,3,3), // 88 - 8f |
michael@0 | 369 | PCK4BITS(3,3,3,3,3,3,3,3), // 90 - 97 |
michael@0 | 370 | PCK4BITS(3,3,3,3,3,3,3,3), // 98 - 9f |
michael@0 | 371 | //0xa0 is illegal in the sjis encoding, but some pages do |
michael@0 | 372 | //contain such a byte. We need to be more error forgiving. |
michael@0 | 373 | PCK4BITS(2,2,2,2,2,2,2,2), // a0 - a7 |
michael@0 | 374 | PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af |
michael@0 | 375 | PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 |
michael@0 | 376 | PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf |
michael@0 | 377 | PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 |
michael@0 | 378 | PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf |
michael@0 | 379 | PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 |
michael@0 | 380 | PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df |
michael@0 | 381 | PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7 |
michael@0 | 382 | PCK4BITS(3,3,3,3,3,4,4,4), // e8 - ef |
michael@0 | 383 | PCK4BITS(4,4,4,4,4,4,4,4), // f0 - f7 |
michael@0 | 384 | PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff (0xfd - 0xff are class 0) |
michael@0 | 385 | }; |
michael@0 | 386 | |
michael@0 | 387 | /* SJIS_st: packed state-transition entries for the Shift_JIS model (3 words x 8 entries); the trailing comments give each row's entry-index range in hex. NOTE(review): the lookup is presumably indexed by current state and byte class in nsCodingStateMachine.h - confirm there. */ |
michael@0 | 388 | static const uint32_t SJIS_st [ 3] = { |
michael@0 | 389 | PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 |
michael@0 | 390 | PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f |
michael@0 | 391 | PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17 |
michael@0 | 392 | }; |
michael@0 | 393 | /* SJISCharLenTable: 6 entries, the same number as the class count given in SJISSMModel. NOTE(review): presumably indexed by byte class and holding the character length in bytes - confirm. */ |
michael@0 | 394 | static const uint32_t SJISCharLenTable[] = {0, 1, 1, 2, 0, 0}; |
michael@0 | 395 | /* SJISSMModel: bundles the Shift_JIS tables defined in this file. NOTE(review): SMModel and the e*4bits constants are declared outside this file, so field meanings are inferred from the initializer only. */ |
michael@0 | 396 | const SMModel SJISSMModel = { |
michael@0 | 397 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls }, /* byte-class table */ |
michael@0 | 398 | 6, /* same as the number of entries in SJISCharLenTable */ |
michael@0 | 399 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st }, /* state table */ |
michael@0 | 400 | CHAR_LEN_TABLE(SJISCharLenTable), |
michael@0 | 401 | "Shift_JIS", |
michael@0 | 402 | }; |
michael@0 | 403 | |
michael@0 | 404 | /* UTF8_cls: class of every byte value for the UTF-8 model (classes 0-11 appear). Each PCK4BITS row covers the eight byte values named in its trailing comment. NOTE(review): PCK4BITS is defined outside this file; presumably it packs eight 4-bit values into one uint32_t - confirm. */ |
michael@0 | 405 | static const uint32_t UTF8_cls [ 256 / 8 ] = { |
michael@0 | 406 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 |
michael@0 | 407 | PCK4BITS( 1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f (0x0e and 0x0f are class 0) |
michael@0 | 408 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 |
michael@0 | 409 | PCK4BITS( 1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f (0x1b is class 0) |
michael@0 | 410 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 |
michael@0 | 411 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f |
michael@0 | 412 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 |
michael@0 | 413 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f |
michael@0 | 414 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 40 - 47 |
michael@0 | 415 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 48 - 4f |
michael@0 | 416 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 50 - 57 |
michael@0 | 417 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 58 - 5f |
michael@0 | 418 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 60 - 67 |
michael@0 | 419 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f |
michael@0 | 420 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77 |
michael@0 | 421 | PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f |
michael@0 | 422 | PCK4BITS( 2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87 |
michael@0 | 423 | PCK4BITS( 2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f |
michael@0 | 424 | PCK4BITS( 3, 3, 3, 3, 3, 3, 3, 3), // 90 - 97 |
michael@0 | 425 | PCK4BITS( 3, 3, 3, 3, 3, 3, 3, 3), // 98 - 9f |
michael@0 | 426 | PCK4BITS( 4, 4, 4, 4, 4, 4, 4, 4), // a0 - a7 |
michael@0 | 427 | PCK4BITS( 4, 4, 4, 4, 4, 4, 4, 4), // a8 - af |
michael@0 | 428 | PCK4BITS( 4, 4, 4, 4, 4, 4, 4, 4), // b0 - b7 |
michael@0 | 429 | PCK4BITS( 4, 4, 4, 4, 4, 4, 4, 4), // b8 - bf |
michael@0 | 430 | PCK4BITS( 0, 0, 5, 5, 5, 5, 5, 5), // c0 - c7 (0xc0 and 0xc1 are class 0) |
michael@0 | 431 | PCK4BITS( 5, 5, 5, 5, 5, 5, 5, 5), // c8 - cf |
michael@0 | 432 | PCK4BITS( 5, 5, 5, 5, 5, 5, 5, 5), // d0 - d7 |
michael@0 | 433 | PCK4BITS( 5, 5, 5, 5, 5, 5, 5, 5), // d8 - df |
michael@0 | 434 | PCK4BITS( 6, 7, 7, 7, 7, 7, 7, 7), // e0 - e7 (0xe0 is class 6) |
michael@0 | 435 | PCK4BITS( 7, 7, 7, 7, 7, 8, 7, 7), // e8 - ef (0xed is class 8) |
michael@0 | 436 | PCK4BITS( 9,10,10,10,11, 0, 0, 0), // f0 - f7 (0xf5 - 0xf7 are class 0) |
michael@0 | 437 | PCK4BITS( 0, 0, 0, 0, 0, 0, 0, 0) // f8 - ff |
michael@0 | 438 | }; |
michael@0 | 439 | |
michael@0 | 440 | /* UTF8_st: packed state-transition entries for the UTF-8 model (15 words x 8 entries); the trailing comments give each row's entry-index range in hex. NOTE(review): the lookup is presumably indexed by current state and byte class in nsCodingStateMachine.h - confirm there. */ |
michael@0 | 441 | static const uint32_t UTF8_st [ 15] = { |
michael@0 | 442 | PCK4BITS(eError,eStart,eError,eError,eError, 3, 4, 5), // 00 - 07 |
michael@0 | 443 | PCK4BITS( 6, 7, 8, 9,eError,eError,eError,eError), // 08 - 0f |
michael@0 | 444 | PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError), // 10 - 17 |
michael@0 | 445 | PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe), // 18 - 1f |
michael@0 | 446 | PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart), // 20 - 27 |
michael@0 | 447 | PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError), // 28 - 2f |
michael@0 | 448 | PCK4BITS(eError,eError,eError,eError, 3,eError,eError,eError), // 30 - 37 |
michael@0 | 449 | PCK4BITS(eError,eError,eError,eError,eError,eError, 3, 3), // 38 - 3f |
michael@0 | 450 | PCK4BITS( 3,eError,eError,eError,eError,eError,eError,eError), // 40 - 47 |
michael@0 | 451 | PCK4BITS(eError,eError, 3, 3,eError,eError,eError,eError), // 48 - 4f |
michael@0 | 452 | PCK4BITS(eError,eError,eError,eError,eError,eError, 5, 5), // 50 - 57 |
michael@0 | 453 | PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError), // 58 - 5f |
michael@0 | 454 | PCK4BITS(eError,eError, 5, 5, 5,eError,eError,eError), // 60 - 67 |
michael@0 | 455 | PCK4BITS(eError,eError,eError,eError,eError,eError, 5,eError), // 68 - 6f |
michael@0 | 456 | PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) // 70 - 77 |
michael@0 | 457 | }; |
michael@0 | 458 | /* UTF8CharLenTable: 12 entries, the same number as the class count given in UTF8SMModel. NOTE(review): presumably indexed by byte class and holding the character length in bytes - confirm. */ |
michael@0 | 459 | static const uint32_t UTF8CharLenTable[] = {0, 1, 0, 0, 0, 2, 3, 3, 3, 4, 4, 4}; |
michael@0 | 460 | /* UTF8SMModel: bundles the UTF-8 tables defined in this file. NOTE(review): SMModel and the e*4bits constants are declared outside this file, so field meanings are inferred from the initializer only. */ |
michael@0 | 461 | const SMModel UTF8SMModel = { |
michael@0 | 462 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls }, /* byte-class table */ |
michael@0 | 463 | 12, /* same as the number of entries in UTF8CharLenTable */ |
michael@0 | 464 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st }, /* state table */ |
michael@0 | 465 | CHAR_LEN_TABLE(UTF8CharLenTable), |
michael@0 | 466 | "UTF-8", |
michael@0 | 467 | }; |