1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/compact_lang_det_impl.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,183 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 +#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_ 1.23 +#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_ 1.24 + 1.25 +#include <vector> 1.26 + 1.27 +#include "../public/compact_lang_det.h" // For CLDHints, ResultChunkVector 1.28 +#include "integral_types.h" 1.29 +#include "lang_script.h" 1.30 + 1.31 +namespace CLD2 { 1.32 + 1.33 +// Internal use flags: each constant is a distinct single bit (0x01..0x80), below the public flag range 1.34 +static const int kCLDFlagFinish = 1; 1.35 +static const int kCLDFlagSqueeze = 2; 1.36 +static const int kCLDFlagRepeats = 4; 1.37 +static const int kCLDFlagTop40 = 8; 1.38 +static const int kCLDFlagShort = 16; // Deprecated, unused 1.39 +static const int kCLDFlagHint = 32; 1.40 +static const int kCLDFlagUseWords = 64; 1.41 +static const int kCLDFlagUNUSED = 128; 1.42 + 1.43 +// Public use flags, debug output controls, defined in compact_lang_det.h 1.44 +// 0x0100 and above 1.45 + 1.46 +/*** 1.47 + 1.48 +Flag meanings: 1.49 + 1.50 +Flags are used in the context of a recursive call from Detect to itself, 1.51 +trying to deal in a more restrictive way with input that was not 
reliably 1.52 +identified in the top-level call. 1.53 + 1.54 +Finish -- Do not further recurse; return whatever result ensues, even if it is 1.55 + unreliable. Typically set in any recursive call to take a second try 1.56 + on unreliable text. 1.57 + 1.58 +Squeeze -- For each text run, do an inplace cheapsqueeze to remove chunks of 1.59 + highly repetitive text and chunks of text with too many 1- and 1.60 + 2-letter words. This avoids scoring repetitive or useless non-text 1.61 + crap in large files such as bogus JPEGs within an HTML file. 1.62 + 1.63 +Repeats -- When scoring a text run, do a cheap prediction of each character 1.64 + and do not score a unigram/quadgram if the last character of same is 1.65 + correctly predicted. This is a slower, finer-grained form of 1.66 + cheapsqueeze, typically used when the first pass got unreliable 1.67 + results. 1.68 + 1.69 +Top40 -- Restrict the set of scored languages to the Google "Top 40", which is 1.70 + actually 38 languages. This gets rid of about 110 languages that 1.71 + represent about 0.7% of the web. Typically used when the first pass 1.72 + got unreliable results. 1.73 + 1.74 +Short -- DEPRECATED, unused 1.75 + 1.76 +Hint -- EXPERIMENTAL flag for compact_lang_det_test.cc to indicate a language 1.77 + hint supplied in parameter plus_one. 1.78 + 1.79 +UseWords -- In addition to scoring quad/uni/nil-grams, score complete words 1.80 + 1.81 + 1.82 + 1.83 +Tentative decision logic: 1.84 + 1.85 +In the middle of first pass -- After 4KB of text, look at the front 256 bytes 1.86 + of every full 4KB buffer. If it compresses very well (say 3:1) or has 1.87 + lots of spaces (say 1 of every 4 bytes), assume that the input is 1.88 + large and contains lots of bogus non-text. Recurse, passing the 1.89 + Squeeze flag to strip out chunks of this non-text. 1.90 + 1.91 +At the end of the first pass -- 1.92 + If the top language is reliable and >= 70% of the document, return. 
1.93 + Else if the top language is reliable and top+2nd >= say 94%, return. 1.94 + Else, either the top language is not reliable or there is a lot of 1.95 + other crap. 1.96 +***/ 1.97 + 1.98 + 1.99 + // Scan interchange-valid UTF-8 bytes and detect most likely language, 1.100 + // or set of languages. 1.101 + // 1.102 + // Design goals: 1.103 + // Skip over big stretches of HTML tags 1.104 + // Able to return ranges of different languages 1.105 + // Relatively small tables and relatively fast processing 1.106 + // Thread safe 1.107 + // 1.108 + // Pairs a count with a pointer to constant Language values (presumably the languages associated with one script -- TODO confirm against the table data) 1.109 + typedef struct { 1.110 + int perscript_count; 1.111 + const Language* perscript_lang; 1.112 + } PerScriptPair; 1.113 + // Groups the quadgram hashing constants with pointers to the lookup tables 1.114 + typedef struct { 1.115 + // Constants for hashing 4-7 byte quadgram to 32 bits 1.116 + const int kQuadHashB4Shift; 1.117 + const int kQuadHashB4bShift; 1.118 + const int kQuadHashB5Shift; 1.119 + const int kQuadHashB5bShift; 1.120 + // Constants for hashing 32 bits to kQuadKeyTable subscript/key 1.121 + const int kHashvalToSubShift; 1.122 + const uint32 kHashvalToSubMask; 1.123 + const int kHashvalToKeyShift; 1.124 + const uint32 kHashvalToKeyMask; 1.125 + const int kHashvalAssociativity; 1.126 + // Pointers to the actual tables 1.127 + const PerScriptPair* kPerScriptPair; 1.128 + const uint16* kQuadKeyTable; 1.129 + const uint32* kQuadValueTable; 1.130 + } LangDetObj; 1.131 + 1.132 + // For HTML documents, tags are skipped, along with <script> ... </script> 1.133 + // and <style> ... </style> sequences, and entities are expanded. 1.134 + // 1.135 + // We distinguish between bytes of the raw input buffer and bytes of non-tag 1.136 + // text letters. Since tags can be over 50% of the bytes of an HTML Page, 1.137 + // and are nearly all seven-bit ASCII English, we prefer to distinguish 1.138 + // language mixture fractions based on just the non-tag text. 
1.139 + // 1.140 + // Inputs: buffer and buffer_length (the input text and its length) 1.141 + // is_plain_text if true says to NOT parse/skip HTML tags nor entities; plus_one carries the language hint used with the experimental Hint flag 1.142 + // Outputs: 1.143 + // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE 1.144 + // percent3 is an array of the text percentages 0..100 of the top 3 languages 1.145 + // normalized_score3 is an array of internal scores, normalized to the 1.146 + // average score for each language over a body of training text. A 1.147 + // normalized score significantly away from 1.0 indicates very skewed text 1.148 + // or gibberish. 1.149 + // 1.150 + // text_bytes is the amount of non-tag/letters-only text found 1.151 + // is_reliable set true if the returned Language is at least 2**30 times more 1.152 + // probable than the second-best Language 1.153 + // 1.154 + // Return value: the most likely Language for the majority of the input text 1.155 + // Length 0 input and text with no reliable letter sequences returns 1.156 + // UNKNOWN_LANGUAGE 1.157 + // 1.158 + // Subsetting: For fast detection over large documents, these routines will 1.159 + // only scan up to a fixed limit (currently 160KB of non-tag letters). 1.160 + // 1.161 + 1.162 + Language DetectLanguageSummaryV2( 1.163 + const char* buffer, 1.164 + int buffer_length, 1.165 + bool is_plain_text, 1.166 + const CLDHints* cld_hints, 1.167 + bool allow_extended_lang, 1.168 + int flags, 1.169 + Language plus_one, 1.170 + Language* language3, 1.171 + int* percent3, 1.172 + double* normalized_score3, 1.173 + ResultChunkVector* resultchunkvector, 1.174 + int* text_bytes, 1.175 + bool* is_reliable); 1.176 + 1.177 + // For unit testing: 1.178 + // Remove portions of text that have a high density of spaces, or that are 1.179 + // overly repetitive, squeezing the remaining text in-place to the front 1.180 + // of the input buffer. 
1.181 + // Return the new, possibly-shorter length (isrc is presumably the buffer squeezed in place and srclen its input length; ichunksize is not described here -- TODO confirm against the implementation) 1.182 + int CheapSqueezeInplace(char* isrc, int srclen, int ichunksize); 1.183 + 1.184 +} // End namespace CLD2 1.185 + 1.186 +#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_