Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | #include "nsLatin1Prober.h" |
michael@0 | 7 | #include "prmem.h" |
michael@0 | 8 | #include <stdio.h> |
michael@0 | 9 | |
// Character classes assigned to individual bytes. Latin1_CharToClass maps a
// byte to one of these values, and Latin1ClassModel is indexed by a pair of
// them.
#define UDF        0  // undefined
#define OTH        1  // other
#define ASC        2  // ascii capital letter
#define ASS        3  // ascii small letter
#define ACV        4  // accent capital vowel
#define ACO        5  // accent capital other
#define ASV        6  // accent small vowel
#define ASO        7  // accent small other
#define CLASS_NUM  8  // total classes

// Byte value -> character class. One entry for each possible unsigned char
// value, so any byte can be used as an index directly.
static const unsigned char Latin1_CharToClass[256] =
{
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0x00 - 0x07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0x08 - 0x0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0x10 - 0x17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0x18 - 0x1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0x20 - 0x27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0x28 - 0x2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0x30 - 0x37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0x38 - 0x3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 0x40 - 0x47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 0x48 - 0x4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 0x50 - 0x57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 0x58 - 0x5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 0x60 - 0x67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 0x68 - 0x6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 0x70 - 0x77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 0x78 - 0x7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 0x80 - 0x87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 0x88 - 0x8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0x90 - 0x97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 0x98 - 0x9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0xA0 - 0xA7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0xA8 - 0xAF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0xB0 - 0xB7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 0xB8 - 0xBF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // 0xC0 - 0xC7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // 0xC8 - 0xCF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // 0xD0 - 0xD7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // 0xD8 - 0xDF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // 0xE0 - 0xE7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // 0xE8 - 0xEF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // 0xF0 - 0xF7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // 0xF8 - 0xFF
};


// Likelihood of one character class following another, stored as a flat
// CLASS_NUM x CLASS_NUM matrix. The entry for a pair lives at
// [previousClass * CLASS_NUM + currentClass].
//
// Entry values:
//   0 : illegal
//   1 : very unlikely
//   2 : normal
//   3 : very likely
static const unsigned char Latin1ClassModel[CLASS_NUM * CLASS_NUM] =
{
  // UDF OTH ASC ASS ACV ACO ASV ASO      <- current class
      0,  0,  0,  0,  0,  0,  0,  0,   // previous: UDF
      0,  3,  3,  3,  3,  3,  3,  3,   // previous: OTH
      0,  3,  3,  3,  3,  3,  3,  3,   // previous: ASC
      0,  3,  3,  3,  1,  1,  3,  3,   // previous: ASS
      0,  3,  3,  3,  1,  2,  1,  2,   // previous: ACV
      0,  3,  3,  3,  3,  3,  3,  3,   // previous: ACO
      0,  3,  1,  3,  1,  1,  1,  3,   // previous: ASV
      0,  3,  1,  3,  1,  1,  3,  3,   // previous: ASO
};
michael@0 | 74 | |
michael@0 | 75 | void nsLatin1Prober::Reset(void) |
michael@0 | 76 | { |
michael@0 | 77 | mState = eDetecting; |
michael@0 | 78 | mLastCharClass = OTH; |
michael@0 | 79 | for (int i = 0; i < FREQ_CAT_NUM; i++) |
michael@0 | 80 | mFreqCounter[i] = 0; |
michael@0 | 81 | } |
michael@0 | 82 | |
michael@0 | 83 | |
michael@0 | 84 | nsProbingState nsLatin1Prober::HandleData(const char* aBuf, uint32_t aLen) |
michael@0 | 85 | { |
michael@0 | 86 | char *newBuf1 = 0; |
michael@0 | 87 | uint32_t newLen1 = 0; |
michael@0 | 88 | |
michael@0 | 89 | if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { |
michael@0 | 90 | newBuf1 = (char*)aBuf; |
michael@0 | 91 | newLen1 = aLen; |
michael@0 | 92 | } |
michael@0 | 93 | |
michael@0 | 94 | unsigned char charClass; |
michael@0 | 95 | unsigned char freq; |
michael@0 | 96 | for (uint32_t i = 0; i < newLen1; i++) |
michael@0 | 97 | { |
michael@0 | 98 | charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; |
michael@0 | 99 | freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass]; |
michael@0 | 100 | if (freq == 0) { |
michael@0 | 101 | mState = eNotMe; |
michael@0 | 102 | break; |
michael@0 | 103 | } |
michael@0 | 104 | mFreqCounter[freq]++; |
michael@0 | 105 | mLastCharClass = charClass; |
michael@0 | 106 | } |
michael@0 | 107 | |
michael@0 | 108 | if (newBuf1 != aBuf) |
michael@0 | 109 | PR_FREEIF(newBuf1); |
michael@0 | 110 | |
michael@0 | 111 | return mState; |
michael@0 | 112 | } |
michael@0 | 113 | |
michael@0 | 114 | float nsLatin1Prober::GetConfidence(void) |
michael@0 | 115 | { |
michael@0 | 116 | if (mState == eNotMe) |
michael@0 | 117 | return 0.01f; |
michael@0 | 118 | |
michael@0 | 119 | float confidence; |
michael@0 | 120 | uint32_t total = 0; |
michael@0 | 121 | for (int32_t i = 0; i < FREQ_CAT_NUM; i++) |
michael@0 | 122 | total += mFreqCounter[i]; |
michael@0 | 123 | |
michael@0 | 124 | if(!total) |
michael@0 | 125 | confidence = 0.0f; |
michael@0 | 126 | else |
michael@0 | 127 | { |
michael@0 | 128 | confidence = mFreqCounter[3]*1.0f / total; |
michael@0 | 129 | confidence -= mFreqCounter[1]*20.0f/total; |
michael@0 | 130 | } |
michael@0 | 131 | |
michael@0 | 132 | if (confidence < 0.0f) |
michael@0 | 133 | confidence = 0.0f; |
michael@0 | 134 | |
michael@0 | 135 | // lower the confidence of latin1 so that other more accurate detector |
michael@0 | 136 | // can take priority. |
michael@0 | 137 | confidence *= 0.50f; |
michael@0 | 138 | |
michael@0 | 139 | return confidence; |
michael@0 | 140 | } |
michael@0 | 141 | |
#ifdef DEBUG_chardet
// Debug helper, compiled only when DEBUG_chardet is defined: prints the
// current confidence and the charset name to stdout.
void nsLatin1Prober::DumpStatus()
{
  const float confidence = GetConfidence();
  printf(" Latin1Prober: %1.3f [%s]\r\n", confidence, GetCharSetName());
}
#endif
michael@0 | 148 | |
michael@0 | 149 |