michael@0: /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "nsHebrewProber.h" michael@0: #include michael@0: michael@0: // windows-1255 / ISO-8859-8 code points of interest michael@0: #define FINAL_KAF ('\xea') michael@0: #define NORMAL_KAF ('\xeb') michael@0: #define FINAL_MEM ('\xed') michael@0: #define NORMAL_MEM ('\xee') michael@0: #define FINAL_NUN ('\xef') michael@0: #define NORMAL_NUN ('\xf0') michael@0: #define FINAL_PE ('\xf3') michael@0: #define NORMAL_PE ('\xf4') michael@0: #define FINAL_TSADI ('\xf5') michael@0: #define NORMAL_TSADI ('\xf6') michael@0: michael@0: // Minimum Visual vs Logical final letter score difference. michael@0: // If the difference is below this, don't rely solely on the final letter score distance. michael@0: #define MIN_FINAL_CHAR_DISTANCE (5) michael@0: michael@0: // Minimum Visual vs Logical model score difference. michael@0: // If the difference is below this, don't rely at all on the model score distance. michael@0: #define MIN_MODEL_DISTANCE (0.01) michael@0: michael@0: #define VISUAL_HEBREW_NAME ("ISO-8859-8") michael@0: #define LOGICAL_HEBREW_NAME ("windows-1255") michael@0: michael@0: bool nsHebrewProber::isFinal(char c) michael@0: { michael@0: return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); michael@0: } michael@0: michael@0: bool nsHebrewProber::isNonFinal(char c) michael@0: { michael@0: return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); michael@0: // The normal Tsadi is not a good Non-Final letter due to words like michael@0: // 'lechotet' (to chat) containing an apostrophe after the tsadi. This michael@0: // apostrophe is converted to a space in FilterWithoutEnglishLetters causing michael@0: // the Non-Final tsadi to appear at an end of a word even though this is not michael@0: // the case in the original text. michael@0: // The letters Pe and Kaf rarely display a related behavior of not being a michael@0: // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for michael@0: // example legally end with a Non-Final Pe or Kaf. However, the benefit of michael@0: // these letters as Non-Final letters outweighs the damage since these words michael@0: // are quite rare. michael@0: } michael@0: michael@0: /** HandleData michael@0: * Final letter analysis for logical-visual decision. michael@0: * Look for evidence that the received buffer is either logical Hebrew or michael@0: * visual Hebrew. michael@0: * The following cases are checked: michael@0: * 1) A word longer than 1 letter, ending with a final letter. This is an michael@0: * indication that the text is laid out "naturally" since the final letter michael@0: * really appears at the end. +1 for logical score. michael@0: * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal michael@0: * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with michael@0: * the Non-Final form of that letter. Exceptions to this rule are mentioned michael@0: * above in isNonFinal(). This is an indication that the text is laid out michael@0: * backwards. +1 for visual score michael@0: * 3) A word longer than 1 letter, starting with a final letter. Final letters michael@0: * should not appear at the beginning of a word. This is an indication that michael@0: * the text is laid out backwards. +1 for visual score. michael@0: * michael@0: * The visual score and logical score are accumulated throughout the text and michael@0: * are finally checked against each other in GetCharSetName(). michael@0: * No checking for final letters in the middle of words is done since that case michael@0: * is not an indication for either Logical or Visual text. michael@0: * michael@0: * The input buffer should not contain any white spaces that are not (' ') michael@0: * or any low-ascii punctuation marks. michael@0: */ michael@0: nsProbingState nsHebrewProber::HandleData(const char* aBuf, uint32_t aLen) michael@0: { michael@0: // Both model probers say it's not them. No reason to continue. michael@0: if (GetState() == eNotMe) michael@0: return eNotMe; michael@0: michael@0: const char *curPtr, *endPtr = aBuf+aLen; michael@0: char cur; michael@0: michael@0: for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr) michael@0: { michael@0: cur = *curPtr; michael@0: if (cur == ' ') // We stand on a space - a word just ended michael@0: { michael@0: if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word michael@0: { michael@0: if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space] michael@0: ++mFinalCharLogicalScore; michael@0: else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space] michael@0: ++mFinalCharVisualScore; michael@0: } michael@0: } michael@0: else // Not standing on a space michael@0: { michael@0: if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space] michael@0: ++mFinalCharVisualScore; michael@0: } michael@0: mBeforePrev = mPrev; michael@0: mPrev = cur; michael@0: } michael@0: michael@0: // Forever detecting, till the end or until both model probers return eNotMe (handled above). michael@0: return eDetecting; michael@0: } michael@0: michael@0: // Make the decision: is it Logical or Visual? michael@0: const char* nsHebrewProber::GetCharSetName() michael@0: { michael@0: // If the final letter score distance is dominant enough, rely on it. michael@0: int32_t finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; michael@0: if (finalsub >= MIN_FINAL_CHAR_DISTANCE) michael@0: return LOGICAL_HEBREW_NAME; michael@0: if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) michael@0: return VISUAL_HEBREW_NAME; michael@0: michael@0: // It's not dominant enough, try to rely on the model scores instead. michael@0: float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); michael@0: if (modelsub > MIN_MODEL_DISTANCE) michael@0: return LOGICAL_HEBREW_NAME; michael@0: if (modelsub < -(MIN_MODEL_DISTANCE)) michael@0: return VISUAL_HEBREW_NAME; michael@0: michael@0: // Still no good, back to final letter distance, maybe it'll save the day. michael@0: if (finalsub < 0) michael@0: return VISUAL_HEBREW_NAME; michael@0: michael@0: // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. michael@0: return LOGICAL_HEBREW_NAME; michael@0: } michael@0: michael@0: michael@0: void nsHebrewProber::Reset(void) michael@0: { michael@0: mFinalCharLogicalScore = 0; michael@0: mFinalCharVisualScore = 0; michael@0: michael@0: // mPrev and mBeforePrev are initialized to space in order to simulate a word michael@0: // delimiter at the beginning of the data michael@0: mPrev = ' '; michael@0: mBeforePrev = ' '; michael@0: } michael@0: michael@0: nsProbingState nsHebrewProber::GetState(void) michael@0: { michael@0: // Remain active as long as any of the model probers are active. michael@0: if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) michael@0: return eNotMe; michael@0: return eDetecting; michael@0: } michael@0: michael@0: #ifdef DEBUG_chardet michael@0: void nsHebrewProber::DumpStatus() michael@0: { michael@0: printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); michael@0: } michael@0: #endif