michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: michael@0: #include michael@0: #include michael@0: michael@0: #include "../public/compact_lang_det.h" michael@0: #include "../public/encodings.h" michael@0: #include "compact_lang_det_impl.h" michael@0: #include "integral_types.h" michael@0: #include "lang_script.h" michael@0: michael@0: namespace CLD2 { michael@0: michael@0: // String is "code_version - data_scrape_date" michael@0: //static const char* kDetectLanguageVersion = "V2.0 - 20130715"; michael@0: michael@0: michael@0: // Large-table version for all ~160 languages michael@0: // Small-table version for all ~60 languages michael@0: michael@0: // Scan interchange-valid UTF-8 bytes and detect most likely language michael@0: Language DetectLanguage( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: bool* is_reliable) { michael@0: bool allow_extended_lang = false; michael@0: Language language3[3]; michael@0: int percent3[3]; michael@0: double normalized_score3[3]; michael@0: int text_bytes; michael@0: int flags = 0; michael@0: Language plus_one = UNKNOWN_LANGUAGE; michael@0: const char* tld_hint = ""; michael@0: int encoding_hint = UNKNOWN_ENCODING; michael@0: Language language_hint = UNKNOWN_LANGUAGE; michael@0: CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; michael@0: michael@0: Language lang = DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: &cldhints, michael@0: allow_extended_lang, michael@0: flags, michael@0: plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: NULL, michael@0: &text_bytes, michael@0: is_reliable); michael@0: // Default to English michael@0: if (lang == UNKNOWN_LANGUAGE) { michael@0: lang = ENGLISH; michael@0: } michael@0: return lang; michael@0: } michael@0: michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. michael@0: Language DetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: Language* language3, michael@0: int* percent3, michael@0: int* text_bytes, michael@0: bool* is_reliable) { michael@0: double normalized_score3[3]; michael@0: bool allow_extended_lang = false; michael@0: int flags = 0; michael@0: Language plus_one = UNKNOWN_LANGUAGE; michael@0: const char* tld_hint = ""; michael@0: int encoding_hint = UNKNOWN_ENCODING; michael@0: Language language_hint = UNKNOWN_LANGUAGE; michael@0: CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; michael@0: michael@0: Language lang = DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: &cldhints, michael@0: allow_extended_lang, michael@0: flags, michael@0: plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: NULL, michael@0: text_bytes, michael@0: is_reliable); michael@0: // Default to English michael@0: if (lang == UNKNOWN_LANGUAGE) { michael@0: lang = ENGLISH; michael@0: } michael@0: return lang; michael@0: } michael@0: michael@0: // Same as above, with hints supplied michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. michael@0: Language DetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const char* tld_hint, // "id" boosts Indonesian michael@0: int encoding_hint, // SJS boosts Japanese michael@0: Language language_hint, // ITALIAN boosts it michael@0: Language* language3, michael@0: int* percent3, michael@0: int* text_bytes, michael@0: bool* is_reliable) { michael@0: double normalized_score3[3]; michael@0: bool allow_extended_lang = false; michael@0: int flags = 0; michael@0: Language plus_one = UNKNOWN_LANGUAGE; michael@0: CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; michael@0: michael@0: Language lang = DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: &cldhints, michael@0: allow_extended_lang, michael@0: flags, michael@0: plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: NULL, michael@0: text_bytes, michael@0: is_reliable); michael@0: // Default to English michael@0: if (lang == UNKNOWN_LANGUAGE) { michael@0: lang = ENGLISH; michael@0: } michael@0: return lang; michael@0: } michael@0: michael@0: michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended michael@0: // languages. michael@0: // Extended languages are additional Google interface languages and Unicode michael@0: // single-language scripts, from ext_lang_enc.h michael@0: Language ExtDetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: Language* language3, michael@0: int* percent3, michael@0: int* text_bytes, michael@0: bool* is_reliable) { michael@0: double normalized_score3[3]; michael@0: bool allow_extended_lang = true; michael@0: int flags = 0; michael@0: Language plus_one = UNKNOWN_LANGUAGE; michael@0: const char* tld_hint = ""; michael@0: int encoding_hint = UNKNOWN_ENCODING; michael@0: Language language_hint = UNKNOWN_LANGUAGE; michael@0: CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; michael@0: michael@0: Language lang = DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: &cldhints, michael@0: allow_extended_lang, michael@0: flags, michael@0: plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: NULL, michael@0: text_bytes, michael@0: is_reliable); michael@0: // Do not default to English michael@0: return lang; michael@0: } michael@0: michael@0: // Same as above, with hints supplied michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended michael@0: // languages. michael@0: // Extended languages are additional Google interface languages and Unicode michael@0: // single-language scripts, from ext_lang_enc.h michael@0: Language ExtDetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const char* tld_hint, // "id" boosts Indonesian michael@0: int encoding_hint, // SJS boosts Japanese michael@0: Language language_hint, // ITALIAN boosts it michael@0: Language* language3, michael@0: int* percent3, michael@0: int* text_bytes, michael@0: bool* is_reliable) { michael@0: double normalized_score3[3]; michael@0: bool allow_extended_lang = true; michael@0: int flags = 0; michael@0: Language plus_one = UNKNOWN_LANGUAGE; michael@0: CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; michael@0: michael@0: Language lang = DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: &cldhints, michael@0: allow_extended_lang, michael@0: flags, michael@0: plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: NULL, michael@0: text_bytes, michael@0: is_reliable); michael@0: // Do not default to English michael@0: return lang; michael@0: } michael@0: michael@0: // Same as above, and also returns internal language scores as a ratio to michael@0: // normal score for real text in that language. Scores close to 1.0 indicate michael@0: // normal text, while scores far away from 1.0 indicate badly-skewed text or michael@0: // gibberish michael@0: // michael@0: Language ExtDetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const char* tld_hint, // "id" boosts Indonesian michael@0: int encoding_hint, // SJS boosts Japanese michael@0: Language language_hint, // ITALIAN boosts it michael@0: Language* language3, michael@0: int* percent3, michael@0: double* normalized_score3, michael@0: int* text_bytes, michael@0: bool* is_reliable) { michael@0: bool allow_extended_lang = true; michael@0: int flags = 0; michael@0: Language plus_one = UNKNOWN_LANGUAGE; michael@0: CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; michael@0: michael@0: Language lang = DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: &cldhints, michael@0: allow_extended_lang, michael@0: flags, michael@0: plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: NULL, michael@0: text_bytes, michael@0: is_reliable); michael@0: // Do not default to English michael@0: return lang; michael@0: } michael@0: michael@0: // Use this one. michael@0: // Hints are collected into a struct. michael@0: // Flags are passed in (normally zero). michael@0: // michael@0: // Also returns 3 internal language scores as a ratio to michael@0: // normal score for real text in that language. Scores close to 1.0 indicate michael@0: // normal text, while scores far away from 1.0 indicate badly-skewed text or michael@0: // gibberish michael@0: // michael@0: // Returns a vector of chunks in different languages, so that caller may michael@0: // spell-check, translate, or otherwaise process different parts of the input michael@0: // buffer in language-dependant ways. michael@0: // michael@0: Language ExtDetectLanguageSummary( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const CLDHints* cld_hints, michael@0: int flags, michael@0: Language* language3, michael@0: int* percent3, michael@0: double* normalized_score3, michael@0: ResultChunkVector* resultchunkvector, michael@0: int* text_bytes, michael@0: bool* is_reliable) { michael@0: bool allow_extended_lang = true; michael@0: Language plus_one = UNKNOWN_LANGUAGE; michael@0: michael@0: Language lang = DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: cld_hints, michael@0: allow_extended_lang, michael@0: flags, michael@0: plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: resultchunkvector, michael@0: text_bytes, michael@0: is_reliable); michael@0: // Do not default to English michael@0: return lang; michael@0: } michael@0: michael@0: } // End namespace CLD2 michael@0: