Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | #include "nsHebrewProber.h" |
michael@0 | 7 | #include <stdio.h> |
michael@0 | 8 | |
// Byte values (windows-1255 / ISO-8859-8) of the five Hebrew letters that
// have a distinct word-final form, listed as final/normal pairs.
#define FINAL_KAF     ('\xea')
#define NORMAL_KAF    ('\xeb')

#define FINAL_MEM     ('\xed')
#define NORMAL_MEM    ('\xee')

#define FINAL_NUN     ('\xef')
#define NORMAL_NUN    ('\xf0')

#define FINAL_PE      ('\xf3')
#define NORMAL_PE     ('\xf4')

#define FINAL_TSADI   ('\xf5')
#define NORMAL_TSADI  ('\xf6')

// Smallest gap between the logical and visual final-letter scores that is
// trusted by itself; a smaller gap means other evidence is consulted too.
#define MIN_FINAL_CHAR_DISTANCE (5)

// Smallest gap between the logical and visual model confidences that is
// considered meaningful; a smaller gap is ignored entirely.
#define MIN_MODEL_DISTANCE (0.01)

// Charset names reported for each layout.
#define VISUAL_HEBREW_NAME  ("ISO-8859-8")
#define LOGICAL_HEBREW_NAME ("windows-1255")
michael@0 | 31 | |
michael@0 | 32 | bool nsHebrewProber::isFinal(char c) |
michael@0 | 33 | { |
michael@0 | 34 | return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); |
michael@0 | 35 | } |
michael@0 | 36 | |
michael@0 | 37 | bool nsHebrewProber::isNonFinal(char c) |
michael@0 | 38 | { |
michael@0 | 39 | return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); |
michael@0 | 40 | // The normal Tsadi is not a good Non-Final letter due to words like |
michael@0 | 41 | // 'lechotet' (to chat) containing an apostrophe after the tsadi. This |
michael@0 | 42 | // apostrophe is converted to a space in FilterWithoutEnglishLetters causing |
michael@0 | 43 | // the Non-Final tsadi to appear at an end of a word even though this is not |
michael@0 | 44 | // the case in the original text. |
michael@0 | 45 | // The letters Pe and Kaf rarely display a related behavior of not being a |
michael@0 | 46 | // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for |
michael@0 | 47 | // example legally end with a Non-Final Pe or Kaf. However, the benefit of |
michael@0 | 48 | // these letters as Non-Final letters outweighs the damage since these words |
michael@0 | 49 | // are quite rare. |
michael@0 | 50 | } |
michael@0 | 51 | |
michael@0 | 52 | /** HandleData |
michael@0 | 53 | * Final letter analysis for logical-visual decision. |
michael@0 | 54 | * Look for evidence that the received buffer is either logical Hebrew or |
michael@0 | 55 | * visual Hebrew. |
michael@0 | 56 | * The following cases are checked: |
michael@0 | 57 | * 1) A word longer than 1 letter, ending with a final letter. This is an |
michael@0 | 58 | * indication that the text is laid out "naturally" since the final letter |
michael@0 | 59 | * really appears at the end. +1 for logical score. |
michael@0 | 60 | * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal |
michael@0 | 61 | * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with |
michael@0 | 62 | * the Non-Final form of that letter. Exceptions to this rule are mentioned |
michael@0 | 63 | * above in isNonFinal(). This is an indication that the text is laid out |
michael@0 | 64 | * backwards. +1 for visual score |
michael@0 | 65 | * 3) A word longer than 1 letter, starting with a final letter. Final letters |
michael@0 | 66 | * should not appear at the beginning of a word. This is an indication that |
michael@0 | 67 | * the text is laid out backwards. +1 for visual score. |
michael@0 | 68 | * |
michael@0 | 69 | * The visual score and logical score are accumulated throughout the text and |
michael@0 | 70 | * are finally checked against each other in GetCharSetName(). |
michael@0 | 71 | * No checking for final letters in the middle of words is done since that case |
michael@0 | 72 | * is not an indication for either Logical or Visual text. |
michael@0 | 73 | * |
michael@0 | 74 | * The input buffer should not contain any white spaces that are not (' ') |
michael@0 | 75 | * or any low-ascii punctuation marks. |
michael@0 | 76 | */ |
michael@0 | 77 | nsProbingState nsHebrewProber::HandleData(const char* aBuf, uint32_t aLen) |
michael@0 | 78 | { |
michael@0 | 79 | // Both model probers say it's not them. No reason to continue. |
michael@0 | 80 | if (GetState() == eNotMe) |
michael@0 | 81 | return eNotMe; |
michael@0 | 82 | |
michael@0 | 83 | const char *curPtr, *endPtr = aBuf+aLen; |
michael@0 | 84 | char cur; |
michael@0 | 85 | |
michael@0 | 86 | for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr) |
michael@0 | 87 | { |
michael@0 | 88 | cur = *curPtr; |
michael@0 | 89 | if (cur == ' ') // We stand on a space - a word just ended |
michael@0 | 90 | { |
michael@0 | 91 | if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word |
michael@0 | 92 | { |
michael@0 | 93 | if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space] |
michael@0 | 94 | ++mFinalCharLogicalScore; |
michael@0 | 95 | else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space] |
michael@0 | 96 | ++mFinalCharVisualScore; |
michael@0 | 97 | } |
michael@0 | 98 | } |
michael@0 | 99 | else // Not standing on a space |
michael@0 | 100 | { |
michael@0 | 101 | if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space] |
michael@0 | 102 | ++mFinalCharVisualScore; |
michael@0 | 103 | } |
michael@0 | 104 | mBeforePrev = mPrev; |
michael@0 | 105 | mPrev = cur; |
michael@0 | 106 | } |
michael@0 | 107 | |
michael@0 | 108 | // Forever detecting, till the end or until both model probers return eNotMe (handled above). |
michael@0 | 109 | return eDetecting; |
michael@0 | 110 | } |
michael@0 | 111 | |
michael@0 | 112 | // Make the decision: is it Logical or Visual? |
michael@0 | 113 | const char* nsHebrewProber::GetCharSetName() |
michael@0 | 114 | { |
michael@0 | 115 | // If the final letter score distance is dominant enough, rely on it. |
michael@0 | 116 | int32_t finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; |
michael@0 | 117 | if (finalsub >= MIN_FINAL_CHAR_DISTANCE) |
michael@0 | 118 | return LOGICAL_HEBREW_NAME; |
michael@0 | 119 | if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) |
michael@0 | 120 | return VISUAL_HEBREW_NAME; |
michael@0 | 121 | |
michael@0 | 122 | // It's not dominant enough, try to rely on the model scores instead. |
michael@0 | 123 | float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); |
michael@0 | 124 | if (modelsub > MIN_MODEL_DISTANCE) |
michael@0 | 125 | return LOGICAL_HEBREW_NAME; |
michael@0 | 126 | if (modelsub < -(MIN_MODEL_DISTANCE)) |
michael@0 | 127 | return VISUAL_HEBREW_NAME; |
michael@0 | 128 | |
michael@0 | 129 | // Still no good, back to final letter distance, maybe it'll save the day. |
michael@0 | 130 | if (finalsub < 0) |
michael@0 | 131 | return VISUAL_HEBREW_NAME; |
michael@0 | 132 | |
michael@0 | 133 | // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. |
michael@0 | 134 | return LOGICAL_HEBREW_NAME; |
michael@0 | 135 | } |
michael@0 | 136 | |
michael@0 | 137 | |
michael@0 | 138 | void nsHebrewProber::Reset(void) |
michael@0 | 139 | { |
michael@0 | 140 | mFinalCharLogicalScore = 0; |
michael@0 | 141 | mFinalCharVisualScore = 0; |
michael@0 | 142 | |
michael@0 | 143 | // mPrev and mBeforePrev are initialized to space in order to simulate a word |
michael@0 | 144 | // delimiter at the beginning of the data |
michael@0 | 145 | mPrev = ' '; |
michael@0 | 146 | mBeforePrev = ' '; |
michael@0 | 147 | } |
michael@0 | 148 | |
michael@0 | 149 | nsProbingState nsHebrewProber::GetState(void) |
michael@0 | 150 | { |
michael@0 | 151 | // Remain active as long as any of the model probers are active. |
michael@0 | 152 | if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) |
michael@0 | 153 | return eNotMe; |
michael@0 | 154 | return eDetecting; |
michael@0 | 155 | } |
michael@0 | 156 | |
#ifdef DEBUG_chardet
// Debug aid: prints the logical and visual final-letter scores to stdout.
void nsHebrewProber::DumpStatus()
{
  const char* const statusFormat = " HEB: %d - %d [Logical-Visual score]\r\n";
  printf(statusFormat, mFinalCharLogicalScore, mFinalCharVisualScore);
}
#endif