Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | |
michael@0 | 19 | #ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_ |
michael@0 | 20 | #define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_ |
michael@0 | 21 | |
michael@0 | 22 | #include <vector> |
michael@0 | 23 | |
michael@0 | 24 | #include "../public/compact_lang_det.h" // For CLDHints, ResultChunkVector |
michael@0 | 25 | #include "integral_types.h" |
michael@0 | 26 | #include "lang_script.h" |
michael@0 | 27 | |
michael@0 | 28 | namespace CLD2 { |
michael@0 | 29 | |
michael@0 | 30 | // Internal use flags |
michael@0 | 31 | static const int kCLDFlagFinish = 1; /* do not recurse further; return the result even if it is unreliable */ |
michael@0 | 32 | static const int kCLDFlagSqueeze = 2; /* cheap-squeeze each text run in place to drop repetitive or low-content chunks */ |
michael@0 | 33 | static const int kCLDFlagRepeats = 4; /* skip scoring a unigram/quadgram whose last character is correctly predicted */ |
michael@0 | 34 | static const int kCLDFlagTop40 = 8; /* restrict the scored languages to the "Top 40" set */ |
michael@0 | 35 | static const int kCLDFlagShort = 16; /* deprecated, unused */ |
michael@0 | 36 | static const int kCLDFlagHint = 32; /* experimental: a language hint is supplied in parameter plus_one */ |
michael@0 | 37 | static const int kCLDFlagUseWords = 64; /* score complete words in addition to quad/uni/nil-grams */ |
michael@0 | 38 | static const int kCLDFlagUNUSED = 128; /* not covered by the flag notes in this header; presumably a spare bit -- confirm */ |
michael@0 | 39 | |
michael@0 | 40 | // Public use flags, debug output controls, defined in compact_lang_det.h |
michael@0 | 41 | // 0x0100 and above |
michael@0 | 42 | |
michael@0 | 43 | /*** |
michael@0 | 44 | |
michael@0 | 45 | Flag meanings: |
michael@0 | 46 | |
michael@0 | 47 | Flags are used in the context of a recursive call from Detect to itself, |
michael@0 | 48 | trying to deal in a more restrictive way with input that was not reliably |
michael@0 | 49 | identified in the top-level call. |
michael@0 | 50 | |
michael@0 | 51 | Finish -- Do not further recurse; return whatever result ensues, even if it is |
michael@0 | 52 | unreliable. Typically set in any recursive call to take a second try |
michael@0 | 53 | on unreliable text. |
michael@0 | 54 | |
michael@0 | 55 | Squeeze -- For each text run, do an inplace cheapsqueeze to remove chunks of |
michael@0 | 56 | highly repetitive text and chunks of text with too many 1- and |
michael@0 | 57 | 2-letter words. This avoids scoring repetitive or useless non-text |
michael@0 | 58 | crap in large files such as bogus JPEGs within an HTML file. |
michael@0 | 59 | |
michael@0 | 60 | Repeats -- When scoring a text run, do a cheap prediction of each character |
michael@0 | 61 | and do not score a unigram/quadgram if the last character of same is |
michael@0 | 62 | correctly predicted. This is a slower, finer-grained form of |
michael@0 | 63 | cheapsqueeze, typically used when the first pass got unreliable |
michael@0 | 64 | results. |
michael@0 | 65 | |
michael@0 | 66 | Top40 -- Restrict the set of scored languages to the Google "Top 40", which is |
michael@0 | 67 | actually 38 languages. This gets rid of about 110 languages that |
michael@0 | 68 | represent about 0.7% of the web. Typically used when the first pass |
michael@0 | 69 | got unreliable results. |
michael@0 | 70 | |
michael@0 | 71 | Short -- DEPRECATED, unused |
michael@0 | 72 | |
michael@0 | 73 | Hint -- EXPERIMENTAL flag for compact_lang_det_test.cc to indicate a language |
michael@0 | 74 | hint supplied in parameter plus_one. |
michael@0 | 75 | |
michael@0 | 76 | UseWords -- In addition to scoring quad/uni/nil-grams, score complete words |
michael@0 | 77 | |
michael@0 | 78 | |
michael@0 | 79 | |
michael@0 | 80 | Tentative decision logic: |
michael@0 | 81 | |
michael@0 | 82 | In the middle of first pass -- After 4KB of text, look at the front 256 bytes |
michael@0 | 83 | of every full 4KB buffer. If it compresses very well (say 3:1) or has |
michael@0 | 84 | lots of spaces (say 1 of every 4 bytes), assume that the input is |
michael@0 | 85 | large and contains lots of bogus non-text. Recurse, passing the |
michael@0 | 86 | Squeeze flag to strip out chunks of this non-text. |
michael@0 | 87 | |
michael@0 | 88 | At the end of the first pass -- |
michael@0 | 89 | If the top language is reliable and >= 70% of the document, return. |
michael@0 | 90 | Else if the top language is reliable and top+2nd >= say 94%, return. |
michael@0 | 91 | Else, either the top language is not reliable or there is a lot of |
michael@0 | 92 | other crap. |
michael@0 | 93 | ***/ |
michael@0 | 94 | |
michael@0 | 95 | |
michael@0 | 96 | // Scan interchange-valid UTF-8 bytes and detect most likely language, |
michael@0 | 97 | // or set of languages. |
michael@0 | 98 | // |
michael@0 | 99 | // Design goals: |
michael@0 | 100 | // Skip over big stretches of HTML tags |
michael@0 | 101 | // Able to return ranges of different languages |
michael@0 | 102 | // Relatively small tables and relatively fast processing |
michael@0 | 103 | // Thread safe |
michael@0 | 104 | // |
michael@0 | 105 | |
michael@0 | 106 | typedef struct { /* a count paired with a pointer to Language data; LangDetObj::kPerScriptPair points at entries of this type */ |
michael@0 | 107 | int perscript_count; /* presumably the number of Language entries reachable through perscript_lang -- confirm */ |
michael@0 | 108 | const Language* perscript_lang; /* read-only pointer to Language value(s); ownership is not shown in this header */ |
michael@0 | 109 | } PerScriptPair; |
michael@0 | 110 | |
michael@0 | 111 | typedef struct { /* bundles the quadgram hashing constants with pointers to the tables they index */ |
michael@0 | 112 | // Constants for hashing 4-7 byte quadgram to 32 bits |
michael@0 | 113 | const int kQuadHashB4Shift; |
michael@0 | 114 | const int kQuadHashB4bShift; |
michael@0 | 115 | const int kQuadHashB5Shift; |
michael@0 | 116 | const int kQuadHashB5bShift; |
michael@0 | 117 | // Constants for hashing 32 bits to kQuadKeyTable subscript/key |
michael@0 | 118 | const int kHashvalToSubShift; |
michael@0 | 119 | const uint32 kHashvalToSubMask; |
michael@0 | 120 | const int kHashvalToKeyShift; |
michael@0 | 121 | const uint32 kHashvalToKeyMask; |
michael@0 | 122 | const int kHashvalAssociativity; /* presumably the number of candidate slots probed per subscript -- confirm */ |
michael@0 | 123 | // Pointers to the actual tables |
michael@0 | 124 | const PerScriptPair* kPerScriptPair; |
michael@0 | 125 | const uint16* kQuadKeyTable; /* 16-bit entries; addressed via the subscript/key constants above */ |
michael@0 | 126 | const uint32* kQuadValueTable; /* 32-bit entries; relation to kQuadKeyTable is not shown here -- presumably parallel, confirm */ |
michael@0 | 127 | } LangDetObj; |
michael@0 | 128 | |
michael@0 | 129 | // For HTML documents, tags are skipped, along with <script> ... </script> |
michael@0 | 130 | // and <style> ... </style> sequences, and entities are expanded. |
michael@0 | 131 | // |
michael@0 | 132 | // We distinguish between bytes of the raw input buffer and bytes of non-tag |
michael@0 | 133 | // text letters. Since tags can be over 50% of the bytes of an HTML Page, |
michael@0 | 134 | // and are nearly all seven-bit ASCII English, we prefer to distinguish |
michael@0 | 135 | // language mixture fractions based on just the non-tag text. |
michael@0 | 136 | // |
michael@0 | 137 | // Inputs: text and text_length |
michael@0 | 138 | // is_plain_text if true says to NOT parse/skip HTML tags nor entities |
michael@0 | 139 | // Outputs: |
michael@0 | 140 | // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE |
michael@0 | 141 | // percent3 is an array of the text percentages 0..100 of the top 3 languages |
michael@0 | 142 | // normalized_score3 is an array of internal scores, normalized to the |
michael@0 | 143 | // average score for each language over a body of training text. A |
michael@0 | 144 | // normalized score significantly away from 1.0 indicates very skewed text |
michael@0 | 145 | // or gibberish. |
michael@0 | 146 | // |
michael@0 | 147 | // text_bytes is the amount of non-tag/letters-only text found |
michael@0 | 148 | // is_reliable set true if the returned Language is at least 2**30 times more |
michael@0 | 149 | // probable than the second-best Language |
michael@0 | 150 | // |
michael@0 | 151 | // Return value: the most likely Language for the majority of the input text |
michael@0 | 152 | // Length 0 input and text with no reliable letter sequences returns |
michael@0 | 153 | // UNKNOWN_LANGUAGE |
michael@0 | 154 | // |
michael@0 | 155 | // Subsetting: For fast detection over large documents, these routines will |
michael@0 | 156 | // only scan up to a fixed limit (currently 160KB of non-tag letters). |
michael@0 | 157 | // |
michael@0 | 158 | |
michael@0 | 159 | Language DetectLanguageSummaryV2( /* returns the most likely Language, or UNKNOWN_LANGUAGE for empty or unscorable input */ |
michael@0 | 160 | const char* buffer, /* input bytes to scan (the preceding comment calls this "text") */ |
michael@0 | 161 | int buffer_length, /* length of buffer (the preceding comment calls this "text_length") */ |
michael@0 | 162 | bool is_plain_text, /* true: do NOT parse/skip HTML tags or entities */ |
michael@0 | 163 | const CLDHints* cld_hints, /* hints type from compact_lang_det.h; whether NULL is accepted is not shown here */ |
michael@0 | 164 | bool allow_extended_lang, /* not described in this header -- confirm meaning against the implementation */ |
michael@0 | 165 | int flags, /* presumably a bitwise OR of the kCLDFlag* values and the public flags (0x0100 and above) -- confirm */ |
michael@0 | 166 | Language plus_one, /* language hint; per the flag notes, used together with kCLDFlagHint */ |
michael@0 | 167 | Language* language3, /* out: array of the top 3 languages, or UNKNOWN_LANGUAGE */ |
michael@0 | 168 | int* percent3, /* out: array of text percentages 0..100 for the top 3 languages */ |
michael@0 | 169 | double* normalized_score3, /* out: array of scores normalized to each language's average training score */ |
michael@0 | 170 | ResultChunkVector* resultchunkvector, /* type from compact_lang_det.h; presumably receives per-range results -- confirm */ |
michael@0 | 171 | int* text_bytes, /* out: amount of non-tag/letters-only text found */ |
michael@0 | 172 | bool* is_reliable); /* out: true if the result is at least 2**30 times more probable than the second-best */ |
michael@0 | 173 | |
michael@0 | 174 | // For unit testing: |
michael@0 | 175 | // Remove portions of text that have a high density of spaces, or that are |
michael@0 | 176 | // overly repetitive, squeezing the remaining text in-place to the front |
michael@0 | 177 | // of the input buffer. |
michael@0 | 178 | // Return the new, possibly-shorter length |
michael@0 | 179 | int CheapSqueezeInplace(char* isrc, int srclen, int ichunksize); /* isrc: buffer rewritten in place; srclen: its length; ichunksize: not described here, presumably the scan chunk size -- confirm; returns the new length */ |
michael@0 | 180 | |
michael@0 | 181 | } // End namespace CLD2 |
michael@0 | 182 | |
michael@0 | 183 | #endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_ |