Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2005-2013, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | */ |
michael@0 | 7 | |
michael@0 | 8 | #include "unicode/utypes.h" |
michael@0 | 9 | |
michael@0 | 10 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 11 | |
michael@0 | 12 | #include "csrucode.h" |
michael@0 | 13 | #include "csmatch.h" |
michael@0 | 14 | |
michael@0 | 15 | U_NAMESPACE_BEGIN |
michael@0 | 16 | |
michael@0 | 17 | CharsetRecog_Unicode::~CharsetRecog_Unicode() |
michael@0 | 18 | { |
michael@0 | 19 | // nothing to do |
michael@0 | 20 | } |
michael@0 | 21 | |
michael@0 | 22 | CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() |
michael@0 | 23 | { |
michael@0 | 24 | // nothing to do |
michael@0 | 25 | } |
michael@0 | 26 | |
michael@0 | 27 | const char *CharsetRecog_UTF_16_BE::getName() const |
michael@0 | 28 | { |
michael@0 | 29 | return "UTF-16BE"; |
michael@0 | 30 | } |
michael@0 | 31 | |
michael@0 | 32 | UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const |
michael@0 | 33 | { |
michael@0 | 34 | const uint8_t *input = textIn->fRawInput; |
michael@0 | 35 | int32_t confidence = 0; |
michael@0 | 36 | int32_t length = textIn->fRawLength; |
michael@0 | 37 | |
michael@0 | 38 | if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) { |
michael@0 | 39 | confidence = 100; |
michael@0 | 40 | } |
michael@0 | 41 | |
michael@0 | 42 | // TODO: Do some statastics to check for unsigned UTF-16BE |
michael@0 | 43 | results->set(textIn, this, confidence); |
michael@0 | 44 | return (confidence > 0); |
michael@0 | 45 | } |
michael@0 | 46 | |
michael@0 | 47 | CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() |
michael@0 | 48 | { |
michael@0 | 49 | // nothing to do |
michael@0 | 50 | } |
michael@0 | 51 | |
michael@0 | 52 | const char *CharsetRecog_UTF_16_LE::getName() const |
michael@0 | 53 | { |
michael@0 | 54 | return "UTF-16LE"; |
michael@0 | 55 | } |
michael@0 | 56 | |
michael@0 | 57 | UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const |
michael@0 | 58 | { |
michael@0 | 59 | const uint8_t *input = textIn->fRawInput; |
michael@0 | 60 | int32_t confidence = 0; |
michael@0 | 61 | int32_t length = textIn->fRawLength; |
michael@0 | 62 | |
michael@0 | 63 | if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) { |
michael@0 | 64 | confidence = 100; |
michael@0 | 65 | } |
michael@0 | 66 | |
michael@0 | 67 | // TODO: Do some statastics to check for unsigned UTF-16LE |
michael@0 | 68 | results->set(textIn, this, confidence); |
michael@0 | 69 | return (confidence > 0); |
michael@0 | 70 | } |
michael@0 | 71 | |
michael@0 | 72 | CharsetRecog_UTF_32::~CharsetRecog_UTF_32() |
michael@0 | 73 | { |
michael@0 | 74 | // nothing to do |
michael@0 | 75 | } |
michael@0 | 76 | |
michael@0 | 77 | UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const |
michael@0 | 78 | { |
michael@0 | 79 | const uint8_t *input = textIn->fRawInput; |
michael@0 | 80 | int32_t limit = (textIn->fRawLength / 4) * 4; |
michael@0 | 81 | int32_t numValid = 0; |
michael@0 | 82 | int32_t numInvalid = 0; |
michael@0 | 83 | bool hasBOM = FALSE; |
michael@0 | 84 | int32_t confidence = 0; |
michael@0 | 85 | |
michael@0 | 86 | if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) { |
michael@0 | 87 | hasBOM = TRUE; |
michael@0 | 88 | } |
michael@0 | 89 | |
michael@0 | 90 | for(int32_t i = 0; i < limit; i += 4) { |
michael@0 | 91 | int32_t ch = getChar(input, i); |
michael@0 | 92 | |
michael@0 | 93 | if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { |
michael@0 | 94 | numInvalid += 1; |
michael@0 | 95 | } else { |
michael@0 | 96 | numValid += 1; |
michael@0 | 97 | } |
michael@0 | 98 | } |
michael@0 | 99 | |
michael@0 | 100 | |
michael@0 | 101 | // Cook up some sort of confidence score, based on presense of a BOM |
michael@0 | 102 | // and the existence of valid and/or invalid multi-byte sequences. |
michael@0 | 103 | if (hasBOM && numInvalid==0) { |
michael@0 | 104 | confidence = 100; |
michael@0 | 105 | } else if (hasBOM && numValid > numInvalid*10) { |
michael@0 | 106 | confidence = 80; |
michael@0 | 107 | } else if (numValid > 3 && numInvalid == 0) { |
michael@0 | 108 | confidence = 100; |
michael@0 | 109 | } else if (numValid > 0 && numInvalid == 0) { |
michael@0 | 110 | confidence = 80; |
michael@0 | 111 | } else if (numValid > numInvalid*10) { |
michael@0 | 112 | // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance. |
michael@0 | 113 | confidence = 25; |
michael@0 | 114 | } |
michael@0 | 115 | |
michael@0 | 116 | results->set(textIn, this, confidence); |
michael@0 | 117 | return (confidence > 0); |
michael@0 | 118 | } |
michael@0 | 119 | |
michael@0 | 120 | CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() |
michael@0 | 121 | { |
michael@0 | 122 | // nothing to do |
michael@0 | 123 | } |
michael@0 | 124 | |
michael@0 | 125 | const char *CharsetRecog_UTF_32_BE::getName() const |
michael@0 | 126 | { |
michael@0 | 127 | return "UTF-32BE"; |
michael@0 | 128 | } |
michael@0 | 129 | |
michael@0 | 130 | int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const |
michael@0 | 131 | { |
michael@0 | 132 | return input[index + 0] << 24 | input[index + 1] << 16 | |
michael@0 | 133 | input[index + 2] << 8 | input[index + 3]; |
michael@0 | 134 | } |
michael@0 | 135 | |
michael@0 | 136 | CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE() |
michael@0 | 137 | { |
michael@0 | 138 | // nothing to do |
michael@0 | 139 | } |
michael@0 | 140 | |
michael@0 | 141 | const char *CharsetRecog_UTF_32_LE::getName() const |
michael@0 | 142 | { |
michael@0 | 143 | return "UTF-32LE"; |
michael@0 | 144 | } |
michael@0 | 145 | |
michael@0 | 146 | int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const |
michael@0 | 147 | { |
michael@0 | 148 | return input[index + 3] << 24 | input[index + 2] << 16 | |
michael@0 | 149 | input[index + 1] << 8 | input[index + 0]; |
michael@0 | 150 | } |
michael@0 | 151 | |
michael@0 | 152 | U_NAMESPACE_END |
michael@0 | 153 | #endif |
michael@0 | 154 |