|
1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "nsHebrewProber.h" |
|
7 #include <stdio.h> |
|
8 |
|
// windows-1255 / ISO-8859-8 code points of interest.
// These are file-local typed constants rather than preprocessor macros so
// that they obey scoping rules and carry a real type (char) that matches the
// bytes they are compared against.
static const char FINAL_KAF    = '\xea';
static const char NORMAL_KAF   = '\xeb';
static const char FINAL_MEM    = '\xed';
static const char NORMAL_MEM   = '\xee';
static const char FINAL_NUN    = '\xef';
static const char NORMAL_NUN   = '\xf0';
static const char FINAL_PE     = '\xf3';
static const char NORMAL_PE    = '\xf4';
static const char FINAL_TSADI  = '\xf5';
static const char NORMAL_TSADI = '\xf6';

// Minimum Visual vs Logical final letter score difference.
// If the difference is below this, don't rely solely on the final letter
// score distance.
static const int MIN_FINAL_CHAR_DISTANCE = 5;

// Minimum Visual vs Logical model score difference.
// If the difference is below this, don't rely at all on the model score
// distance. Kept as a double so comparisons against it promote exactly as
// they did with the literal 0.01.
static const double MIN_MODEL_DISTANCE = 0.01;

// Charset names reported for the visual and the logical Hebrew layouts.
static const char* const VISUAL_HEBREW_NAME  = "ISO-8859-8";
static const char* const LOGICAL_HEBREW_NAME = "windows-1255";
|
31 |
|
32 bool nsHebrewProber::isFinal(char c) |
|
33 { |
|
34 return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); |
|
35 } |
|
36 |
|
37 bool nsHebrewProber::isNonFinal(char c) |
|
38 { |
|
39 return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); |
|
40 // The normal Tsadi is not a good Non-Final letter due to words like |
|
41 // 'lechotet' (to chat) containing an apostrophe after the tsadi. This |
|
42 // apostrophe is converted to a space in FilterWithoutEnglishLetters causing |
|
43 // the Non-Final tsadi to appear at an end of a word even though this is not |
|
44 // the case in the original text. |
|
45 // The letters Pe and Kaf rarely display a related behavior of not being a |
|
46 // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for |
|
47 // example legally end with a Non-Final Pe or Kaf. However, the benefit of |
|
48 // these letters as Non-Final letters outweighs the damage since these words |
|
49 // are quite rare. |
|
50 } |
|
51 |
|
52 /** HandleData |
|
53 * Final letter analysis for logical-visual decision. |
|
54 * Look for evidence that the received buffer is either logical Hebrew or |
|
55 * visual Hebrew. |
|
56 * The following cases are checked: |
|
57 * 1) A word longer than 1 letter, ending with a final letter. This is an |
|
58 * indication that the text is laid out "naturally" since the final letter |
|
59 * really appears at the end. +1 for logical score. |
|
60 * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal |
|
61 * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with |
|
62 * the Non-Final form of that letter. Exceptions to this rule are mentioned |
|
63 * above in isNonFinal(). This is an indication that the text is laid out |
|
64 * backwards. +1 for visual score |
|
65 * 3) A word longer than 1 letter, starting with a final letter. Final letters |
|
66 * should not appear at the beginning of a word. This is an indication that |
|
67 * the text is laid out backwards. +1 for visual score. |
|
68 * |
|
69 * The visual score and logical score are accumulated throughout the text and |
|
70 * are finally checked against each other in GetCharSetName(). |
|
71 * No checking for final letters in the middle of words is done since that case |
|
72 * is not an indication for either Logical or Visual text. |
|
73 * |
|
74 * The input buffer should not contain any white spaces that are not (' ') |
|
75 * or any low-ascii punctuation marks. |
|
76 */ |
|
77 nsProbingState nsHebrewProber::HandleData(const char* aBuf, uint32_t aLen) |
|
78 { |
|
79 // Both model probers say it's not them. No reason to continue. |
|
80 if (GetState() == eNotMe) |
|
81 return eNotMe; |
|
82 |
|
83 const char *curPtr, *endPtr = aBuf+aLen; |
|
84 char cur; |
|
85 |
|
86 for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr) |
|
87 { |
|
88 cur = *curPtr; |
|
89 if (cur == ' ') // We stand on a space - a word just ended |
|
90 { |
|
91 if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word |
|
92 { |
|
93 if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space] |
|
94 ++mFinalCharLogicalScore; |
|
95 else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space] |
|
96 ++mFinalCharVisualScore; |
|
97 } |
|
98 } |
|
99 else // Not standing on a space |
|
100 { |
|
101 if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space] |
|
102 ++mFinalCharVisualScore; |
|
103 } |
|
104 mBeforePrev = mPrev; |
|
105 mPrev = cur; |
|
106 } |
|
107 |
|
108 // Forever detecting, till the end or until both model probers return eNotMe (handled above). |
|
109 return eDetecting; |
|
110 } |
|
111 |
|
112 // Make the decision: is it Logical or Visual? |
|
113 const char* nsHebrewProber::GetCharSetName() |
|
114 { |
|
115 // If the final letter score distance is dominant enough, rely on it. |
|
116 int32_t finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; |
|
117 if (finalsub >= MIN_FINAL_CHAR_DISTANCE) |
|
118 return LOGICAL_HEBREW_NAME; |
|
119 if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) |
|
120 return VISUAL_HEBREW_NAME; |
|
121 |
|
122 // It's not dominant enough, try to rely on the model scores instead. |
|
123 float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); |
|
124 if (modelsub > MIN_MODEL_DISTANCE) |
|
125 return LOGICAL_HEBREW_NAME; |
|
126 if (modelsub < -(MIN_MODEL_DISTANCE)) |
|
127 return VISUAL_HEBREW_NAME; |
|
128 |
|
129 // Still no good, back to final letter distance, maybe it'll save the day. |
|
130 if (finalsub < 0) |
|
131 return VISUAL_HEBREW_NAME; |
|
132 |
|
133 // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. |
|
134 return LOGICAL_HEBREW_NAME; |
|
135 } |
|
136 |
|
137 |
|
138 void nsHebrewProber::Reset(void) |
|
139 { |
|
140 mFinalCharLogicalScore = 0; |
|
141 mFinalCharVisualScore = 0; |
|
142 |
|
143 // mPrev and mBeforePrev are initialized to space in order to simulate a word |
|
144 // delimiter at the beginning of the data |
|
145 mPrev = ' '; |
|
146 mBeforePrev = ' '; |
|
147 } |
|
148 |
|
149 nsProbingState nsHebrewProber::GetState(void) |
|
150 { |
|
151 // Remain active as long as any of the model probers are active. |
|
152 if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) |
|
153 return eNotMe; |
|
154 return eDetecting; |
|
155 } |
|
156 |
|
#ifdef DEBUG_chardet
// Debug-only helper: prints the accumulated logical and visual final letter
// scores, in that order, to standard output.
void nsHebrewProber::DumpStatus()
{
  printf(" HEB: %d - %d [Logical-Visual score]\r\n",
         mFinalCharLogicalScore,
         mFinalCharVisualScore);
}
#endif