Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | /** |
michael@0 | 6 | * A character set converter from HZ to Unicode. |
michael@0 | 7 | * |
michael@0 | 8 | * |
michael@0 | 9 | * @created 08/Sept/1999 |
michael@0 | 10 | * @author Yueheng Xu, Yueheng.Xu@intel.com |
michael@0 | 11 | * |
michael@0 | 12 | * Note: in this HZ-GB-2312 converter, we accept a string composed of 7-bit HZ |
michael@0 | 13 | * encoded Chinese chars, as defined in RFC1843, available at |
michael@0 | 14 | * http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html |
michael@0 | 15 | * and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html. |
michael@0 | 16 | * |
michael@0 | 17 | * Earlier versions of the converter said: |
michael@0 | 18 | * "In an effort to match the similar extended capability of Microsoft |
michael@0 | 19 | * Internet Explorer 5.0. We also accept the 8-bit GB encoded chars |
michael@0 | 20 | * mixed in a HZ string. |
michael@0 | 21 | * But this should not be a recommended practice for HTML authors." |
michael@0 | 22 | * However, testing in current versions of IE shows that it only accepts |
michael@0 | 23 | * 8-bit characters when the converter is in GB state, and when in ASCII |
michael@0 | 24 | * state each single 8-bit character is converted to U+FFFD. |
michael@0 | 25 | * |
michael@0 | 26 | * The conversion priorities are as follows: first convert 8-bit GB code; then, |
michael@0 | 27 | * consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current |
michael@0 | 28 | * state ( default to ASCII state ) of the string, each 7-bit char is converted as an |
michael@0 | 29 | * ASCII, or two 7-bit chars are converted into a Chinese character. |
michael@0 | 30 | */ |
michael@0 | 31 | |
michael@0 | 32 | |
michael@0 | 33 | |
michael@0 | 34 | #include "nsHZToUnicode.h" |
michael@0 | 35 | #include "gbku.h" |
michael@0 | 36 | #include "mozilla/Telemetry.h" |
michael@0 | 37 | |
michael@0 | 38 | //---------------------------------------------------------------------- |
michael@0 | 39 | // Class nsHZToUnicode [implementation] |
michael@0 | 40 | |
michael@0 | 41 | //---------------------------------------------------------------------- |
michael@0 | 42 | // Subclassing of nsTablesDecoderSupport class [implementation] |
michael@0 | 43 | |
// Encoding modes kept in mHZState (everything except the odd-byte flag bit).
#define HZ_STATE_GB               1
#define HZ_STATE_ASCII            2

// Bit set in mHZState while the first byte of a two-byte sequence is pending.
#define HZ_STATE_ODD_BYTE_FLAG    0x80

// HZ escape bytes: '~' introduces an escape; '{' and '}' are the second
// bytes of the "~{" and "~}" mode-switch sequences.
#define HZLEAD1                   '~'
#define HZLEAD2                   '{'
#define HZLEAD3                   '}'

// Accessors that split mHZState into its pending-byte flag and its mode.
#define HZ_ODD_BYTE_STATE         (mHZState & HZ_STATE_ODD_BYTE_FLAG)
#define HZ_ENCODING_STATE         (mHZState & ~HZ_STATE_ODD_BYTE_FLAG)
michael@0 | 52 | |
michael@0 | 53 | using namespace mozilla; |
michael@0 | 54 | |
michael@0 | 55 | nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1) |
michael@0 | 56 | { |
michael@0 | 57 | mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state |
michael@0 | 58 | mRunLength = 0; |
michael@0 | 59 | mOddByte = 0; |
michael@0 | 60 | Telemetry::Accumulate(Telemetry::DECODER_INSTANTIATED_HZ, true); |
michael@0 | 61 | } |
michael@0 | 62 | |
michael@0 | 63 | //Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp. |
michael@0 | 64 | NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff( |
michael@0 | 65 | const char* aSrc, |
michael@0 | 66 | int32_t * aSrcLength, |
michael@0 | 67 | char16_t *aDest, |
michael@0 | 68 | int32_t * aDestLength) |
michael@0 | 69 | { |
michael@0 | 70 | int32_t i=0; |
michael@0 | 71 | int32_t iSrcLength = *aSrcLength; |
michael@0 | 72 | int32_t iDestlen = 0; |
michael@0 | 73 | *aSrcLength=0; |
michael@0 | 74 | nsresult res = NS_OK; |
michael@0 | 75 | char oddByte = mOddByte; |
michael@0 | 76 | |
michael@0 | 77 | for (i=0; i<iSrcLength; i++) { |
michael@0 | 78 | if (iDestlen >= (*aDestLength)) { |
michael@0 | 79 | res = NS_OK_UDEC_MOREOUTPUT; |
michael@0 | 80 | break; |
michael@0 | 81 | } |
michael@0 | 82 | |
michael@0 | 83 | char srcByte = *aSrc++; |
michael@0 | 84 | (*aSrcLength)++; |
michael@0 | 85 | |
michael@0 | 86 | if (!HZ_ODD_BYTE_STATE) { |
michael@0 | 87 | if (srcByte == HZLEAD1 || |
michael@0 | 88 | (HZ_ENCODING_STATE == HZ_STATE_GB && |
michael@0 | 89 | (UINT8_IN_RANGE(0x21, srcByte, 0x7E) || |
michael@0 | 90 | UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) { |
michael@0 | 91 | oddByte = srcByte; |
michael@0 | 92 | mHZState |= HZ_STATE_ODD_BYTE_FLAG; |
michael@0 | 93 | } else { |
michael@0 | 94 | *aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING : |
michael@0 | 95 | CAST_CHAR_TO_UNICHAR(srcByte); |
michael@0 | 96 | iDestlen++; |
michael@0 | 97 | } |
michael@0 | 98 | } else { |
michael@0 | 99 | if (oddByte & 0x80) { |
michael@0 | 100 | // Accept legal 8-bit GB 2312-80 sequences in GB mode only |
michael@0 | 101 | NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB, |
michael@0 | 102 | "Invalid lead byte in ASCII mode"); |
michael@0 | 103 | *aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) && |
michael@0 | 104 | UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ? |
michael@0 | 105 | mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING; |
michael@0 | 106 | mRunLength++; |
michael@0 | 107 | iDestlen++; |
michael@0 | 108 | // otherwise, it is a 7-bit byte |
michael@0 | 109 | // The source will be an ASCII or a 7-bit HZ code depending on oddByte |
michael@0 | 110 | } else if (oddByte == HZLEAD1) { // if it is lead by '~' |
michael@0 | 111 | switch (srcByte) { |
michael@0 | 112 | case HZLEAD2: |
michael@0 | 113 | // we got a '~{' |
michael@0 | 114 | // we are switching to HZ state |
michael@0 | 115 | mHZState = HZ_STATE_GB; |
michael@0 | 116 | mRunLength = 0; |
michael@0 | 117 | break; |
michael@0 | 118 | |
michael@0 | 119 | case HZLEAD3: |
michael@0 | 120 | // we got a '~}' |
michael@0 | 121 | // we are switching to ASCII state |
michael@0 | 122 | mHZState = HZ_STATE_ASCII; |
michael@0 | 123 | if (mRunLength == 0) { |
michael@0 | 124 | *aDest++ = UCS2_NO_MAPPING; |
michael@0 | 125 | iDestlen++; |
michael@0 | 126 | } |
michael@0 | 127 | mRunLength = 0; |
michael@0 | 128 | break; |
michael@0 | 129 | |
michael@0 | 130 | case HZLEAD1: |
michael@0 | 131 | // we got a '~~', process like an ASCII, but no state change |
michael@0 | 132 | *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte); |
michael@0 | 133 | iDestlen++; |
michael@0 | 134 | mRunLength++; |
michael@0 | 135 | break; |
michael@0 | 136 | |
michael@0 | 137 | default: |
michael@0 | 138 | // Undefined ESC sequence '~X': treat as an error if X is a |
michael@0 | 139 | // printable character or we are in ASCII mode, and resynchronize |
michael@0 | 140 | // on the second character. |
michael@0 | 141 | // |
michael@0 | 142 | // N.B. For compatibility with other implementations, we treat '~\n' |
michael@0 | 143 | // as an illegal sequence even though RFC1843 permits it, and for |
michael@0 | 144 | // the same reason we pass through control characters including '\n' |
michael@0 | 145 | // and ' ' even in GB mode. |
michael@0 | 146 | if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) { |
michael@0 | 147 | *aDest++ = UCS2_NO_MAPPING; |
michael@0 | 148 | iDestlen++; |
michael@0 | 149 | } |
michael@0 | 150 | aSrc--; |
michael@0 | 151 | (*aSrcLength)--; |
michael@0 | 152 | i--; |
michael@0 | 153 | break; |
michael@0 | 154 | } |
michael@0 | 155 | } else if (HZ_ENCODING_STATE == HZ_STATE_GB) { |
michael@0 | 156 | *aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) && |
michael@0 | 157 | UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ? |
michael@0 | 158 | mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) : |
michael@0 | 159 | UCS2_NO_MAPPING; |
michael@0 | 160 | mRunLength++; |
michael@0 | 161 | iDestlen++; |
michael@0 | 162 | } else { |
michael@0 | 163 | NS_NOTREACHED("2-byte sequence that we don't know how to handle"); |
michael@0 | 164 | *aDest++ = UCS2_NO_MAPPING; |
michael@0 | 165 | iDestlen++; |
michael@0 | 166 | } |
michael@0 | 167 | oddByte = 0; |
michael@0 | 168 | mHZState &= ~HZ_STATE_ODD_BYTE_FLAG; |
michael@0 | 169 | } |
michael@0 | 170 | } // for loop |
michael@0 | 171 | mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0; |
michael@0 | 172 | *aDestLength = iDestlen; |
michael@0 | 173 | return res; |
michael@0 | 174 | } |
michael@0 | 175 | |
michael@0 | 176 |