michael@0: /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #ifndef nsHebrewProber_h__ michael@0: #define nsHebrewProber_h__ michael@0: michael@0: #include "nsSBCharSetProber.h" michael@0: michael@0: // This prober doesn't actually recognize a language or a charset. michael@0: // It is a helper prober for the use of the Hebrew model probers michael@0: class nsHebrewProber: public nsCharSetProber michael@0: { michael@0: public: michael@0: nsHebrewProber(void) :mLogicalProb(0), mVisualProb(0) { Reset(); } michael@0: michael@0: virtual ~nsHebrewProber(void) {} michael@0: virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen); michael@0: virtual const char* GetCharSetName(); michael@0: virtual void Reset(void); michael@0: michael@0: virtual nsProbingState GetState(void); michael@0: michael@0: virtual float GetConfidence(void) { return (float)0.0; } michael@0: michael@0: void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb) michael@0: { mLogicalProb = logicalPrb; mVisualProb = visualPrb; } michael@0: michael@0: #ifdef DEBUG_chardet michael@0: virtual void DumpStatus(); michael@0: #endif michael@0: michael@0: protected: michael@0: static bool isFinal(char c); michael@0: static bool isNonFinal(char c); michael@0: michael@0: int32_t mFinalCharLogicalScore, mFinalCharVisualScore; michael@0: michael@0: // The two last characters seen in the previous buffer. michael@0: char mPrev, mBeforePrev; michael@0: michael@0: // These probers are owned by the group prober. michael@0: nsCharSetProber *mLogicalProb, *mVisualProb; michael@0: }; michael@0: michael@0: /** michael@0: * ** General ideas of the Hebrew charset recognition ** michael@0: * michael@0: * Four main charsets exist in Hebrew: michael@0: * "ISO-8859-8" - Visual Hebrew michael@0: * "windows-1255" - Logical Hebrew michael@0: * "ISO-8859-8-I" - Logical Hebrew michael@0: * "x-mac-hebrew" - ?? Logical Hebrew ?? michael@0: * michael@0: * Both "ISO" charsets use a completely identical set of code points, whereas michael@0: * "windows-1255" and "x-mac-hebrew" are two different proper supersets of michael@0: * these code points. windows-1255 defines additional characters in the range michael@0: * 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific michael@0: * diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. michael@0: * x-mac-hebrew defines similar additional code points but with a different michael@0: * mapping. michael@0: * michael@0: * As far as an average Hebrew text with no diacritics is concerned, all four michael@0: * charsets are identical with respect to code points. Meaning that for the michael@0: * main Hebrew alphabet, all four map the same values to all 27 Hebrew letters michael@0: * (including final letters). michael@0: * michael@0: * The dominant difference between these charsets is their directionality. michael@0: * "Visual" directionality means that the text is ordered as if the renderer is michael@0: * not aware of a BIDI rendering algorithm. The renderer sees the text and michael@0: * draws it from left to right. The text itself when ordered naturally is read michael@0: * backwards. A buffer of Visual Hebrew generally looks like so: michael@0: * "[last word of first line spelled backwards] [whole line ordered backwards michael@0: * and spelled backwards] [first word of first line spelled backwards] michael@0: * [end of line] [last word of second line] ... etc' " michael@0: * adding punctuation marks, numbers and English text to visual text is michael@0: * naturally also "visual" and from left to right. michael@0: * michael@0: * "Logical" directionality means the text is ordered "naturally" according to michael@0: * the order it is read. It is the responsibility of the renderer to display michael@0: * the text from right to left. A BIDI algorithm is used to place general michael@0: * punctuation marks, numbers and English text in the text. michael@0: * michael@0: * Texts in x-mac-hebrew are almost impossible to find on the Internet. From michael@0: * what little evidence I could find, it seems that its general directionality michael@0: * is Logical. michael@0: * michael@0: * To sum up all of the above, the Hebrew probing mechanism knows about two michael@0: * charsets: michael@0: * Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are michael@0: * backwards while line order is natural. For charset recognition purposes michael@0: * the line order is unimportant (In fact, for this implementation, even michael@0: * word order is unimportant). michael@0: * Logical Hebrew - "windows-1255" - normal, naturally ordered text. michael@0: * michael@0: * "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be michael@0: * specifically identified. michael@0: * "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew michael@0: * that contain special punctuation marks or diacritics is displayed with michael@0: * some unconverted characters showing as question marks. This problem might michael@0: * be corrected using another model prober for x-mac-hebrew. Due to the fact michael@0: * that x-mac-hebrew texts are so rare, writing another model prober isn't michael@0: * worth the effort and performance hit. michael@0: * michael@0: * *** The Prober *** michael@0: * michael@0: * The prober is divided between two nsSBCharSetProbers and an nsHebrewProber, michael@0: * all of which are managed, created, fed data, inquired and deleted by the michael@0: * nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in michael@0: * fact some kind of Hebrew, Logical or Visual. The final decision about which michael@0: * one is it is made by the nsHebrewProber by combining final-letter scores michael@0: * with the scores of the two nsSBCharSetProbers to produce a final answer. michael@0: * michael@0: * The nsSBCSGroupProber is responsible for stripping the original text of HTML michael@0: * tags, English characters, numbers, low-ASCII punctuation characters, spaces michael@0: * and new lines. It reduces any sequence of such characters to a single space. michael@0: * The buffer fed to each prober in the SBCS group prober is pure text in michael@0: * high-ASCII. michael@0: * The two nsSBCharSetProbers (model probers) share the same language model: michael@0: * Win1255Model. michael@0: * The first nsSBCharSetProber uses the model normally as any other michael@0: * nsSBCharSetProber does, to recognize windows-1255, upon which this model was michael@0: * built. The second nsSBCharSetProber is told to make the pair-of-letter michael@0: * lookup in the language model backwards. This in practice exactly simulates michael@0: * a visual Hebrew model using the windows-1255 logical Hebrew model. michael@0: * michael@0: * The nsHebrewProber is not using any language model. All it does is look for michael@0: * final-letter evidence suggesting the text is either logical Hebrew or visual michael@0: * Hebrew. Disjointed from the model probers, the results of the nsHebrewProber michael@0: * alone are meaningless. nsHebrewProber always returns 0.00 as confidence michael@0: * since it never identifies a charset by itself. Instead, the pointer to the michael@0: * nsHebrewProber is passed to the model probers as a helper "Name Prober". michael@0: * When the Group prober receives a positive identification from any prober, michael@0: * it asks for the name of the charset identified. If the prober queried is a michael@0: * Hebrew model prober, the model prober forwards the call to the michael@0: * nsHebrewProber to make the final decision. In the nsHebrewProber, the michael@0: * decision is made according to the final-letters scores maintained and Both michael@0: * model probers scores. The answer is returned in the form of the name of the michael@0: * charset identified, either "windows-1255" or "ISO-8859-8". michael@0: * michael@0: */ michael@0: #endif /* nsHebrewProber_h__ */