// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//

// NOTE:
// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
// HAITIAN_CREOLE is detected as such.
// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
// MONTENEGRIN is not detected as such, but likely scores as Serbian.
// CROATIAN is detected in the Latin script
// SERBIAN is detected in the Cyrillic and Latin scripts
// Zhuang is detected in the Latin script only.
//
// The languages X_PIG_LATIN and X_KLINGON are detected in the
// extended calls ExtDetectLanguageSummary().
//
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
// is high enough.
// This happens with non-text input such as the bytes of a
// JPEG, and also with text in languages outside the training set.
//
// The following languages are to be detected in multiple scripts:
//  AZERBAIJANI (Latin, Cyrillic*, Arabic*)
//  BURMESE (Latin, Myanmar)
//  HAUSA (Latin, Arabic)
//  KASHMIRI (Arabic, Devanagari)
//  KAZAKH (Latin, Cyrillic, Arabic)
//  KURDISH (Latin*, Arabic)
//  KYRGYZ (Cyrillic, Arabic)
//  LIMBU (Devanagari, Limbu)
//  MONGOLIAN (Cyrillic, Mongolian)
//  SANSKRIT (Latin, Devanagari)
//  SINDHI (Arabic, Devanagari)
//  TAGALOG (Latin, Tagalog)
//  TAJIK (Cyrillic, Arabic*)
//  TATAR (Latin, Cyrillic, Arabic)
//  TURKMEN (Latin, Cyrillic, Arabic)
//  UIGHUR (Latin, Cyrillic, Arabic)
//  UZBEK (Latin, Cyrillic, Arabic)
//
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
//   in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
//   Arabic script.
//

#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_

#include <stdio.h>                    // For FILE
#include <vector>                     // For std::vector
#include "../internal/lang_script.h"  // For Language

namespace CLD2 {

// Scan interchange-valid UTF-8 bytes and detect most likely language,
// or set of languages.
//
// Design goals:
//   Skip over big stretches of HTML tags
//   Able to return ranges of different languages
//   Relatively small tables and relatively fast processing
//   Thread safe
//
// For HTML documents, tags are skipped, along with <script> ... </script>
// and <style> ... </style> sequences, and entities are expanded.
//
// We distinguish between bytes of the raw input buffer and bytes of non-tag
// text letters. Since tags can be over 50% of the bytes of an HTML Page,
// and are nearly all seven-bit ASCII English, we prefer to distinguish
// language mixture fractions based on just the non-tag text.
//
// Inputs: buffer and buffer_length
//  Code skips HTML tags and expands HTML entities, unless
//  is_plain_text is true
// Outputs:
//  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
//  percent3 is an array of the text percentages 0..100 of the top 3 languages
//  text_bytes is the amount of non-tag/letters-only text found
//  is_reliable set true if the returned Language is some amount more
//   probable than the second-best Language. Calculation is a complex function
//   of the length of the text and the different-script runs of text.
// Return value: the most likely Language for the majority of the input text
//  Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
//  defaults to ENGLISH.
//
// The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
// backwards compatibility with a different detector.
//
// The third version may return UNKNOWN_LANGUAGE, and also returns extended
// language codes from lang_script.h
//


// Instead of individual arguments, pass in hints as an initialized struct
// Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
//
// Pass in hints whenever possible; doing so improves detection accuracy. The
// set of passed-in hints are all information that is external to the text
// itself.
michael@0: // michael@0: // The content_language_hint is intended to come from an HTTP header michael@0: // Content-Language: field, the tld_hint from the hostname of a URL, the michael@0: // encoding-hint from an encoding detector applied to the input michael@0: // document, and the language hint from any other context you might have. michael@0: // The lang= tags inside an HTML document will be picked up as hints michael@0: // by code within the compact language detector. michael@0: michael@0: typedef struct { michael@0: const char* content_language_hint; // "mi,en" boosts Maori and English michael@0: const char* tld_hint; // "id" boosts Indonesian michael@0: int encoding_hint; // SJS boosts Japanese michael@0: Language language_hint; // ITALIAN boosts it michael@0: } CLDHints; michael@0: michael@0: static const int kMaxResultChunkBytes = 65535; michael@0: michael@0: // For returning a vector of per-language pieces of the input buffer michael@0: // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE michael@0: typedef struct { michael@0: int offset; // Starting byte offset in original buffer michael@0: uint16 bytes; // Number of bytes in chunk michael@0: uint16 lang1; // Top lang, as full Language. Apply michael@0: // static_cast() to this short value. michael@0: } ResultChunk; michael@0: typedef std::vector ResultChunkVector; michael@0: michael@0: michael@0: // Scan interchange-valid UTF-8 bytes and detect most likely language michael@0: Language DetectLanguage( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: bool* is_reliable); michael@0: michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 
michael@0: // language3[0] is usually also the return value michael@0: Language DetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: Language* language3, michael@0: int* percent3, michael@0: int* text_bytes, michael@0: bool* is_reliable); michael@0: michael@0: // Same as above, with hints supplied michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. michael@0: // language3[0] is usually also the return value michael@0: Language DetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const char* tld_hint, // "id" boosts Indonesian michael@0: int encoding_hint, // SJS boosts Japanese michael@0: Language language_hint, // ITALIAN boosts it michael@0: Language* language3, michael@0: int* percent3, michael@0: int* text_bytes, michael@0: bool* is_reliable); michael@0: michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended michael@0: // languages. michael@0: // michael@0: // Extended languages are additional interface languages and Unicode michael@0: // single-language scripts, from lang_script.h michael@0: // michael@0: // language3[0] is usually also the return value michael@0: Language ExtDetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: Language* language3, michael@0: int* percent3, michael@0: int* text_bytes, michael@0: bool* is_reliable); michael@0: michael@0: // Same as above, with hints supplied michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended michael@0: // languages. 
michael@0: // michael@0: // Extended languages are additional Google interface languages and Unicode michael@0: // single-language scripts, from lang_script.h michael@0: // michael@0: // language3[0] is usually also the return value michael@0: Language ExtDetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const char* tld_hint, // "id" boosts Indonesian michael@0: int encoding_hint, // SJS boosts Japanese michael@0: Language language_hint, // ITALIAN boosts it michael@0: Language* language3, michael@0: int* percent3, michael@0: int* text_bytes, michael@0: bool* is_reliable); michael@0: michael@0: // Same as above, and also returns 3 internal language scores as a ratio to michael@0: // normal score for real text in that language. Scores close to 1.0 indicate michael@0: // normal text, while scores far away from 1.0 indicate badly-skewed text or michael@0: // gibberish michael@0: // michael@0: Language ExtDetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const char* tld_hint, // "id" boosts Indonesian michael@0: int encoding_hint, // SJS boosts Japanese michael@0: Language language_hint, // ITALIAN boosts it michael@0: Language* language3, michael@0: int* percent3, michael@0: double* normalized_score3, michael@0: int* text_bytes, michael@0: bool* is_reliable); michael@0: michael@0: michael@0: // Use this one. michael@0: // Hints are collected into a struct. michael@0: // Flags are passed in (normally zero). michael@0: // michael@0: // Also returns 3 internal language scores as a ratio to michael@0: // normal score for real text in that language. 
Scores close to 1.0 indicate michael@0: // normal text, while scores far away from 1.0 indicate badly-skewed text or michael@0: // gibberish michael@0: // michael@0: // Returns a vector of chunks in different languages, so that caller may michael@0: // spell-check, translate, or otherwaise process different parts of the input michael@0: // buffer in language-dependant ways. michael@0: // michael@0: Language ExtDetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const CLDHints* cld_hints, michael@0: int flags, michael@0: Language* language3, michael@0: int* percent3, michael@0: double* normalized_score3, michael@0: ResultChunkVector* resultchunkvector, michael@0: int* text_bytes, michael@0: bool* is_reliable); michael@0: michael@0: // Return version text string michael@0: // String is "code_version - data_build_date" michael@0: const char* DetectLanguageVersion(); michael@0: michael@0: michael@0: // Public use flags, debug output controls michael@0: static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads michael@0: static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr michael@0: static const int kCLDFlagCr = 0x0400; // per chunk if HTML michael@0: static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr michael@0: static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr michael@0: static const int kCLDFlagEcho = 0x2000; // Echo input => stderr michael@0: michael@0: michael@0: /*** michael@0: michael@0: Flag meanings: michael@0: kCLDFlagScoreAsQuads michael@0: Normally, several languages are detected solely by their Unicode script. michael@0: Combined with appropritate lookup tables, this flag forces them instead michael@0: to be detected via quadgrams. This can be a useful refinement when looking michael@0: for meaningful text in these languages, instead of just character sets. michael@0: The default tables do not support this use. 
michael@0: kCLDFlagHtml michael@0: For each detection call, write an HTML file to stderr, showing the text michael@0: chunks and their detected languages. michael@0: kCLDFlagCr michael@0: In that HTML file, force a new line for each chunk. michael@0: kCLDFlagVerbose michael@0: In that HTML file, show every lookup entry. michael@0: kCLDFlagQuiet michael@0: In that HTML file, suppress most of the output detail. michael@0: kCLDFlagEcho michael@0: Echo every input buffer to stderr. michael@0: ***/ michael@0: michael@0: // Debug output: Print the resultchunkvector to file f michael@0: void DumpResultChunkVector(FILE* f, const char* src, michael@0: ResultChunkVector* resultchunkvector); michael@0: michael@0: #ifdef CLD2_DYNAMIC_MODE michael@0: michael@0: // If compiled with dynamic mode, load data from the specified file location. michael@0: // If other data has already been loaded, it is discarded and the data is read michael@0: // in from the specified file location again (even if the file has not changed). michael@0: // WARNING: Before calling this method, language detection will always fail michael@0: // and will always return the unknown language. michael@0: void loadData(const char* fileName); michael@0: michael@0: // If compiled with dynamic mode, unload the previously-loaded data. michael@0: // WARNING: After calling this method, language detection will no longer work michael@0: // and will always return the unknown language. michael@0: void unloadData(); michael@0: michael@0: // Returns true if and only if data has been loaded via a call to loadData(...) michael@0: // and has not been subsequently unladed via a call to unloadDate(). michael@0: bool isDataLoaded(); michael@0: michael@0: #endif // #ifdef CLD2_DYNAMIC_MODE michael@0: michael@0: }; // End namespace CLD2 michael@0: michael@0: #endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_