michael@0: // Copyright 2013 Google Inc. All Rights Reserved.
michael@0: //
michael@0: // Licensed under the Apache License, Version 2.0 (the "License");
michael@0: // you may not use this file except in compliance with the License.
michael@0: // You may obtain a copy of the License at
michael@0: //
michael@0: //     http://www.apache.org/licenses/LICENSE-2.0
michael@0: //
michael@0: // Unless required by applicable law or agreed to in writing, software
michael@0: // distributed under the License is distributed on an "AS IS" BASIS,
michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0: // See the License for the specific language governing permissions and
michael@0: // limitations under the License.
michael@0: 
michael@0: //
michael@0: // Author: dsites@google.com (Dick Sites)
michael@0: //
michael@0: 
michael@0: #include <stdio.h>
michael@0: #include <stdlib.h>
michael@0: 
michael@0: #include "../public/compact_lang_det.h"
michael@0: #include "../public/encodings.h"
michael@0: #include "compact_lang_det_impl.h"
michael@0: #include "integral_types.h"
michael@0: #include "lang_script.h"
michael@0: 
michael@0: namespace CLD2 {
michael@0: 
michael@0: // String is "code_version - data_scrape_date"
michael@0: //static const char* kDetectLanguageVersion = "V2.0 - 20130715";
michael@0: 
michael@0: 
michael@0: // Large-table version for all ~160 languages
michael@0: // Small-table version for all ~60 languages
michael@0: 
michael@0: // Scan interchange-valid UTF-8 bytes and detect most likely language
michael@0: Language DetectLanguage(
michael@0:                           const char* buffer,
michael@0:                           int buffer_length,
michael@0:                           bool is_plain_text,
michael@0:                           bool* is_reliable) {
michael@0:   bool allow_extended_lang = false;
michael@0:   Language language3[3];
michael@0:   int percent3[3];
michael@0:   double normalized_score3[3];
michael@0:   int text_bytes;
michael@0:   int flags = 0;
michael@0:   Language plus_one = UNKNOWN_LANGUAGE;
michael@0:   const char* tld_hint = "";
michael@0:   int encoding_hint = UNKNOWN_ENCODING;
michael@0:   Language language_hint = UNKNOWN_LANGUAGE;
michael@0:   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0: 
michael@0:   Language lang = DetectLanguageSummaryV2(
michael@0:                           buffer,
michael@0:                           buffer_length,
michael@0:                           is_plain_text,
michael@0:                           &cldhints,
michael@0:                           allow_extended_lang,
michael@0:                           flags,
michael@0:                           plus_one,
michael@0:                           language3,
michael@0:                           percent3,
michael@0:                           normalized_score3,
michael@0:                           NULL,
michael@0:                           &text_bytes,
michael@0:                           is_reliable);
michael@0:   // Default to English
michael@0:   if (lang == UNKNOWN_LANGUAGE) {
michael@0:     lang = ENGLISH;
michael@0:   }
michael@0:   return lang;
michael@0: }
michael@0: 
michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
michael@0: Language DetectLanguageSummary(
michael@0:                           const char* buffer,
michael@0:                           int buffer_length,
michael@0:                           bool is_plain_text,
michael@0:                           Language* language3,
michael@0:                           int* percent3,
michael@0:                           int* text_bytes,
michael@0:                           bool* is_reliable) {
michael@0:   double normalized_score3[3];
michael@0:   bool allow_extended_lang = false;
michael@0:   int flags = 0;
michael@0:   Language plus_one = UNKNOWN_LANGUAGE;
michael@0:   const char* tld_hint = "";
michael@0:   int encoding_hint = UNKNOWN_ENCODING;
michael@0:   Language language_hint = UNKNOWN_LANGUAGE;
michael@0:   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0: 
michael@0:   Language lang = DetectLanguageSummaryV2(
michael@0:                           buffer,
michael@0:                           buffer_length,
michael@0:                           is_plain_text,
michael@0:                           &cldhints,
michael@0:                           allow_extended_lang,
michael@0:                           flags,
michael@0:                           plus_one,
michael@0:                           language3,
michael@0:                           percent3,
michael@0:                           normalized_score3,
michael@0:                           NULL,
michael@0:                           text_bytes,
michael@0:                           is_reliable);
michael@0:   // Default to English
michael@0:   if (lang == UNKNOWN_LANGUAGE) {
michael@0:     lang = ENGLISH;
michael@0:   }
michael@0:   return lang;
michael@0: }
michael@0: 
michael@0: // Same as above, with hints supplied
michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
michael@0: Language DetectLanguageSummary(
michael@0:                           const char* buffer,
michael@0:                           int buffer_length,
michael@0:                           bool is_plain_text,
michael@0:                           const char* tld_hint,       // "id" boosts Indonesian
michael@0:                           int encoding_hint,          // SJS boosts Japanese
michael@0:                           Language language_hint,     // ITALIAN boosts it
michael@0:                           Language* language3,
michael@0:                           int* percent3,
michael@0:                           int* text_bytes,
michael@0:                           bool* is_reliable) {
michael@0:   double normalized_score3[3];
michael@0:   bool allow_extended_lang = false;
michael@0:   int flags = 0;
michael@0:   Language plus_one = UNKNOWN_LANGUAGE;
michael@0:   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0: 
michael@0:   Language lang = DetectLanguageSummaryV2(
michael@0:                           buffer,
michael@0:                           buffer_length,
michael@0:                           is_plain_text,
michael@0:                           &cldhints,
michael@0:                           allow_extended_lang,
michael@0:                           flags,
michael@0:                           plus_one,
michael@0:                           language3,
michael@0:                           percent3,
michael@0:                           normalized_score3,
michael@0:                           NULL,
michael@0:                           text_bytes,
michael@0:                           is_reliable);
michael@0:   // Default to English
michael@0:   if (lang == UNKNOWN_LANGUAGE) {
michael@0:     lang = ENGLISH;
michael@0:   }
michael@0:   return lang;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
michael@0: // languages.
michael@0: // Extended languages are additional Google interface languages and Unicode
michael@0: // single-language scripts, from ext_lang_enc.h
michael@0: Language ExtDetectLanguageSummary(
michael@0:                           const char* buffer,
michael@0:                           int buffer_length,
michael@0:                           bool is_plain_text,
michael@0:                           Language* language3,
michael@0:                           int* percent3,
michael@0:                           int* text_bytes,
michael@0:                           bool* is_reliable) {
michael@0:   double normalized_score3[3];
michael@0:   bool allow_extended_lang = true;
michael@0:   int flags = 0;
michael@0:   Language plus_one = UNKNOWN_LANGUAGE;
michael@0:   const char* tld_hint = "";
michael@0:   int encoding_hint = UNKNOWN_ENCODING;
michael@0:   Language language_hint = UNKNOWN_LANGUAGE;
michael@0:   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0: 
michael@0:   Language lang = DetectLanguageSummaryV2(
michael@0:                           buffer,
michael@0:                           buffer_length,
michael@0:                           is_plain_text,
michael@0:                           &cldhints,
michael@0:                           allow_extended_lang,
michael@0:                           flags,
michael@0:                           plus_one,
michael@0:                           language3,
michael@0:                           percent3,
michael@0:                           normalized_score3,
michael@0:                           NULL,
michael@0:                           text_bytes,
michael@0:                           is_reliable);
michael@0:   // Do not default to English
michael@0:   return lang;
michael@0: }
michael@0: 
michael@0: // Same as above, with hints supplied
michael@0: // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
michael@0: // languages.
michael@0: // Extended languages are additional Google interface languages and Unicode
michael@0: // single-language scripts, from ext_lang_enc.h
michael@0: Language ExtDetectLanguageSummary(
michael@0:                           const char* buffer,
michael@0:                           int buffer_length,
michael@0:                           bool is_plain_text,
michael@0:                           const char* tld_hint,       // "id" boosts Indonesian
michael@0:                           int encoding_hint,          // SJS boosts Japanese
michael@0:                           Language language_hint,     // ITALIAN boosts it
michael@0:                           Language* language3,
michael@0:                           int* percent3,
michael@0:                           int* text_bytes,
michael@0:                           bool* is_reliable) {
michael@0:   double normalized_score3[3];
michael@0:   bool allow_extended_lang = true;
michael@0:   int flags = 0;
michael@0:   Language plus_one = UNKNOWN_LANGUAGE;
michael@0:   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0: 
michael@0:   Language lang = DetectLanguageSummaryV2(
michael@0:                           buffer,
michael@0:                           buffer_length,
michael@0:                           is_plain_text,
michael@0:                           &cldhints,
michael@0:                           allow_extended_lang,
michael@0:                           flags,
michael@0:                           plus_one,
michael@0:                           language3,
michael@0:                           percent3,
michael@0:                           normalized_score3,
michael@0:                           NULL,
michael@0:                           text_bytes,
michael@0:                           is_reliable);
michael@0:   // Do not default to English
michael@0:   return lang;
michael@0: }
michael@0: 
michael@0: // Same as above, and also returns internal language scores as a ratio to
michael@0: // normal score for real text in that language. Scores close to 1.0 indicate
michael@0: // normal text, while scores far away from 1.0 indicate badly-skewed text or
michael@0: // gibberish
michael@0: //
michael@0: Language ExtDetectLanguageSummary(
michael@0:                         const char* buffer,
michael@0:                         int buffer_length,
michael@0:                         bool is_plain_text,
michael@0:                         const char* tld_hint,       // "id" boosts Indonesian
michael@0:                         int encoding_hint,          // SJS boosts Japanese
michael@0:                         Language language_hint,     // ITALIAN boosts it
michael@0:                         Language* language3,
michael@0:                         int* percent3,
michael@0:                         double* normalized_score3,
michael@0:                         int* text_bytes,
michael@0:                         bool* is_reliable) {
michael@0:   bool allow_extended_lang = true;
michael@0:   int flags = 0;
michael@0:   Language plus_one = UNKNOWN_LANGUAGE;
michael@0:   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0: 
michael@0:   Language lang = DetectLanguageSummaryV2(
michael@0:                           buffer,
michael@0:                           buffer_length,
michael@0:                           is_plain_text,
michael@0:                           &cldhints,
michael@0:                           allow_extended_lang,
michael@0:                           flags,
michael@0:                           plus_one,
michael@0:                           language3,
michael@0:                           percent3,
michael@0:                           normalized_score3,
michael@0:                           NULL,
michael@0:                           text_bytes,
michael@0:                           is_reliable);
michael@0:   // Do not default to English
michael@0:   return lang;
michael@0: }
michael@0: 
michael@0: // Use this one.
michael@0: // Hints are collected into a struct.
michael@0: // Flags are passed in (normally zero).
michael@0: //
michael@0: // Also returns 3 internal language scores as a ratio to
michael@0: // normal score for real text in that language. Scores close to 1.0 indicate
michael@0: // normal text, while scores far away from 1.0 indicate badly-skewed text or
michael@0: // gibberish
michael@0: //
michael@0: // Returns a vector of chunks in different languages, so that caller may
michael@0: // spell-check, translate, or otherwaise process different parts of the input
michael@0: // buffer in language-dependant ways.
michael@0: //
michael@0: Language ExtDetectLanguageSummary(
michael@0:                         const char* buffer,
michael@0:                         int buffer_length,
michael@0:                         bool is_plain_text,
michael@0:                         const CLDHints* cld_hints,
michael@0:                         int flags,
michael@0:                         Language* language3,
michael@0:                         int* percent3,
michael@0:                         double* normalized_score3,
michael@0:                         ResultChunkVector* resultchunkvector,
michael@0:                         int* text_bytes,
michael@0:                         bool* is_reliable) {
michael@0:   bool allow_extended_lang = true;
michael@0:   Language plus_one = UNKNOWN_LANGUAGE;
michael@0: 
michael@0:   Language lang = DetectLanguageSummaryV2(
michael@0:                           buffer,
michael@0:                           buffer_length,
michael@0:                           is_plain_text,
michael@0:                           cld_hints,
michael@0:                           allow_extended_lang,
michael@0:                           flags,
michael@0:                           plus_one,
michael@0:                           language3,
michael@0:                           percent3,
michael@0:                           normalized_score3,
michael@0:                           resultchunkvector,
michael@0:                           text_bytes,
michael@0:                           is_reliable);
michael@0:   // Do not default to English
michael@0:   return lang;
michael@0: }
michael@0: 
michael@0: }       // End namespace CLD2
michael@0: