1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/universalchardet/src/base/nsHebrewProber.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,143 @@ 1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#ifndef nsHebrewProber_h__ 1.10 +#define nsHebrewProber_h__ 1.11 + 1.12 +#include "nsSBCharSetProber.h" 1.13 + 1.14 +// Helper prober: it does not recognize a language or a charset by itself. 1.15 +// The Hebrew model probers use it to pick between logical and visual Hebrew. 1.16 +class nsHebrewProber: public nsCharSetProber 1.17 +{ 1.18 +public: 1.19 + nsHebrewProber(void) :mLogicalProb(0), mVisualProb(0) { Reset(); } 1.20 + 1.21 + virtual ~nsHebrewProber(void) {} 1.22 + virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen); 1.23 + virtual const char* GetCharSetName(); 1.24 + virtual void Reset(void); 1.25 + 1.26 + virtual nsProbingState GetState(void); 1.27 + // Always 0.0: this prober never identifies a charset by itself. 1.28 + virtual float GetConfidence(void) { return (float)0.0; } 1.29 + // Stores the two model probers; this class does not own them. 1.30 + void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb) 1.31 + { mLogicalProb = logicalPrb; mVisualProb = visualPrb; } 1.32 + 1.33 +#ifdef DEBUG_chardet 1.34 + virtual void DumpStatus(); 1.35 +#endif 1.36 + 1.37 +protected: 1.38 + static bool isFinal(char c); 1.39 + static bool isNonFinal(char c); 1.40 + // Final-letter evidence scores for logical and visual Hebrew. 1.41 + int32_t mFinalCharLogicalScore, mFinalCharVisualScore; 1.42 + 1.43 + // The last two characters seen in the previous buffer. 1.44 + char mPrev, mBeforePrev; 1.45 + 1.46 + // These probers are owned by the group prober, not by this class. 
1.47 + nsCharSetProber *mLogicalProb, *mVisualProb; 1.48 +}; 1.49 + 1.50 +/** 1.51 + * ** General ideas of the Hebrew charset recognition ** 1.52 + * 1.53 + * Four main charsets exist in Hebrew: 1.54 + * "ISO-8859-8" - Visual Hebrew 1.55 + * "windows-1255" - Logical Hebrew 1.56 + * "ISO-8859-8-I" - Logical Hebrew 1.57 + * "x-mac-hebrew" - ?? Logical Hebrew ?? 1.58 + * 1.59 + * Both "ISO" charsets use a completely identical set of code points, whereas 1.60 + * "windows-1255" and "x-mac-hebrew" are two different proper supersets of 1.61 + * these code points. windows-1255 defines additional characters in the range 1.62 + * 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific 1.63 + * diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. 1.64 + * x-mac-hebrew defines similar additional code points but with a different 1.65 + * mapping. 1.66 + * 1.67 + * As far as an average Hebrew text with no diacritics is concerned, all four 1.68 + * charsets are identical with respect to code points, meaning that for the 1.69 + * main Hebrew alphabet, all four map the same values to all 27 Hebrew letters 1.70 + * (including final letters). 1.71 + * 1.72 + * The dominant difference between these charsets is their directionality. 1.73 + * "Visual" directionality means that the text is ordered as if the renderer is 1.74 + * not aware of a BIDI rendering algorithm. The renderer sees the text and 1.75 + * draws it from left to right. The text itself when ordered naturally is read 1.76 + * backwards. A buffer of Visual Hebrew generally looks like so: 1.77 + * "[last word of first line spelled backwards] [whole line ordered backwards 1.78 + * and spelled backwards] [first word of first line spelled backwards] 1.79 + * [end of line] [last word of second line] ... etc' " 1.80 + * Adding punctuation marks, numbers and English text to visual text is 1.81 + * naturally also "visual" and from left to right. 
1.82 + * 1.83 + * "Logical" directionality means the text is ordered "naturally" according to 1.84 + * the order it is read. It is the responsibility of the renderer to display 1.85 + * the text from right to left. A BIDI algorithm is used to place general 1.86 + * punctuation marks, numbers and English text in the text. 1.87 + * 1.88 + * Texts in x-mac-hebrew are almost impossible to find on the Internet. From 1.89 + * what little evidence I could find, it seems that its general directionality 1.90 + * is Logical. 1.91 + * 1.92 + * To sum up all of the above, the Hebrew probing mechanism knows about two 1.93 + * charsets: 1.94 + * Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are 1.95 + * backwards while line order is natural. For charset recognition purposes 1.96 + * the line order is unimportant (in fact, for this implementation, even 1.97 + * word order is unimportant). 1.98 + * Logical Hebrew - "windows-1255" - normal, naturally ordered text. 1.99 + * 1.100 + * "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be 1.101 + * specifically identified. 1.102 + * "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew 1.103 + * that contains special punctuation marks or diacritics is displayed with 1.104 + * some unconverted characters showing as question marks. This problem might 1.105 + * be corrected using another model prober for x-mac-hebrew. Due to the fact 1.106 + * that x-mac-hebrew texts are so rare, writing another model prober isn't 1.107 + * worth the effort and performance hit. 1.108 + * 1.109 + * *** The Prober *** 1.110 + * 1.111 + * The prober is divided between two nsSBCharSetProbers and an nsHebrewProber, 1.112 + * all of which are managed, created, fed data, queried and deleted by the 1.113 + * nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in 1.114 + * fact some kind of Hebrew, Logical or Visual. 
The final decision between the 1.115 + * two is made by the nsHebrewProber by combining final-letter scores 1.116 + * with the scores of the two nsSBCharSetProbers to produce a final answer. 1.117 + * 1.118 + * The nsSBCSGroupProber is responsible for stripping the original text of HTML 1.119 + * tags, English characters, numbers, low-ASCII punctuation characters, spaces 1.120 + * and new lines. It reduces any sequence of such characters to a single space. 1.121 + * The buffer fed to each prober in the SBCS group prober is pure text in 1.122 + * high-ASCII. 1.123 + * The two nsSBCharSetProbers (model probers) share the same language model: 1.124 + * Win1255Model. 1.125 + * The first nsSBCharSetProber uses the model normally as any other 1.126 + * nsSBCharSetProber does, to recognize windows-1255, upon which this model was 1.127 + * built. The second nsSBCharSetProber is told to make the pair-of-letter 1.128 + * lookup in the language model backwards. This in practice exactly simulates 1.129 + * a visual Hebrew model using the windows-1255 logical Hebrew model. 1.130 + * 1.131 + * The nsHebrewProber does not use any language model. All it does is look for 1.132 + * final-letter evidence suggesting the text is either logical Hebrew or visual 1.133 + * Hebrew. Detached from the model probers, the results of the nsHebrewProber 1.134 + * alone are meaningless. nsHebrewProber always returns 0.00 as confidence 1.135 + * since it never identifies a charset by itself. Instead, the pointer to the 1.136 + * nsHebrewProber is passed to the model probers as a helper "Name Prober". 1.137 + * When the Group prober receives a positive identification from any prober, 1.138 + * it asks for the name of the charset identified. If the prober queried is a 1.139 + * Hebrew model prober, the model prober forwards the call to the 1.140 + * nsHebrewProber to make the final decision. 
In the nsHebrewProber, the 1.141 + * decision is made according to the final-letter scores it maintains and both 1.142 + * model probers' scores. The answer is returned in the form of the name of the 1.143 + * charset identified, either "windows-1255" or "ISO-8859-8". 1.144 + * 1.145 + */ 1.146 +#endif /* nsHebrewProber_h__ */