The Tor Browser: browser/components/translation/cld2/internal/compact_lang

browser/components/translation/cld2/internal/compact_lang_det.cc@6474c204b198 (annotated)

browser/components/translation/cld2/internal/compact_lang_det.cc

Wed, 31 Dec 2014 06:09:35 +0100

author: Michael Schloh von Bennewitz <michael@schloh.com>
date: Wed, 31 Dec 2014 06:09:35 +0100
changeset 0: 6474c204b198
permissions: -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 // Author: dsites@google.com (Dick Sites)
 //
 #include <stdio.h>
 #include <stdlib.h>
 #include "../public/compact_lang_det.h"
 #include "../public/encodings.h"
 #include "compact_lang_det_impl.h"
 #include "integral_types.h"
 #include "lang_script.h"
 namespace CLD2 {
 // String is "code_version - data_scrape_date"
 //static const char* kDetectLanguageVersion = "V2.0 - 20130715";
 // Large-table version for all ~160 languages
 // Small-table version for all ~60 languages
 // Scan interchange-valid UTF-8 bytes in |buffer| and report the single most
 // likely language.
 //
 // Forwards to DetectLanguageSummaryV2 with no hints (NULL first hint field,
 // empty TLD hint, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE), flags of zero, and
 // extended languages disabled. The top-3 languages, percentages, normalized
 // scores, and text byte count filled in by that call are discarded here.
 //
 // |is_reliable| is handed straight through to DetectLanguageSummaryV2.
 // Returns ENGLISH whenever the summary result is UNKNOWN_LANGUAGE.
 Language DetectLanguage(const char* buffer,
                         int buffer_length,
                         bool is_plain_text,
                         bool* is_reliable) {
   // Scratch outputs demanded by the V2 entry point; never read afterwards.
   Language top_languages[3];
   int top_percents[3];
   double top_scores[3];
   int scanned_bytes;

   // Hint-free configuration.
   CLDHints no_hints = {NULL, "", UNKNOWN_ENCODING, UNKNOWN_LANGUAGE};

   const Language detected = DetectLanguageSummaryV2(
       buffer, buffer_length, is_plain_text, &no_hints,
       false,             // allow_extended_lang
       0,                 // flags
       UNKNOWN_LANGUAGE,  // plus_one
       top_languages, top_percents, top_scores,
       NULL,              // no result chunk vector requested
       &scanned_bytes, is_reliable);

   // Default to English
   return (detected == UNKNOWN_LANGUAGE) ? ENGLISH : detected;
 }
 // Scan interchange-valid UTF-8 bytes in |buffer| and detect the list of top 3
 // languages, without any caller-supplied hints.
 //
 // Forwards to DetectLanguageSummaryV2 with an empty TLD hint,
 // UNKNOWN_ENCODING, UNKNOWN_LANGUAGE, flags of zero, and extended languages
 // disabled. |language3|, |percent3|, |text_bytes| and |is_reliable| are
 // passed through untouched; the normalized scores are computed into a local
 // array and dropped.
 //
 // Returns ENGLISH whenever the summary result is UNKNOWN_LANGUAGE.
 Language DetectLanguageSummary(const char* buffer,
                                int buffer_length,
                                bool is_plain_text,
                                Language* language3,
                                int* percent3,
                                int* text_bytes,
                                bool* is_reliable) {
   // This overload does not expose scores, so they land in scratch storage.
   double unused_scores[3];

   // Hint-free configuration.
   CLDHints no_hints = {NULL, "", UNKNOWN_ENCODING, UNKNOWN_LANGUAGE};

   const Language detected = DetectLanguageSummaryV2(
       buffer, buffer_length, is_plain_text, &no_hints,
       false,             // allow_extended_lang
       0,                 // flags
       UNKNOWN_LANGUAGE,  // plus_one
       language3, percent3, unused_scores,
       NULL,              // no result chunk vector requested
       text_bytes, is_reliable);

   // Default to English
   return (detected == UNKNOWN_LANGUAGE) ? ENGLISH : detected;
 }
 // Scan interchange-valid UTF-8 bytes in |buffer| and detect the list of top 3
 // languages, using the hints supplied by the caller.
 //
 // The three hints are packed into a CLDHints struct (its first field is set
 // to NULL) and forwarded to DetectLanguageSummaryV2 with flags of zero and
 // extended languages disabled. |language3|, |percent3|, |text_bytes| and
 // |is_reliable| are passed through untouched; the normalized scores are
 // computed into a local array and dropped.
 //
 // Returns ENGLISH whenever the summary result is UNKNOWN_LANGUAGE.
 Language DetectLanguageSummary(
     const char* buffer,
     int buffer_length,
     bool is_plain_text,
     const char* tld_hint,       // "id" boosts Indonesian
     int encoding_hint,          // SJS boosts Japanese
     Language language_hint,     // ITALIAN boosts it
     Language* language3,
     int* percent3,
     int* text_bytes,
     bool* is_reliable) {
   // This overload does not expose scores, so they land in scratch storage.
   double unused_scores[3];

   // Bundle the caller's hints for the V2 entry point.
   CLDHints caller_hints = {NULL, tld_hint, encoding_hint, language_hint};

   const Language detected = DetectLanguageSummaryV2(
       buffer, buffer_length, is_plain_text, &caller_hints,
       false,             // allow_extended_lang
       0,                 // flags
       UNKNOWN_LANGUAGE,  // plus_one
       language3, percent3, unused_scores,
       NULL,              // no result chunk vector requested
       text_bytes, is_reliable);

   // Default to English
   return (detected == UNKNOWN_LANGUAGE) ? ENGLISH : detected;
 }
 // Scan interchange-valid UTF-8 bytes in |buffer| and detect the list of top 3
 // extended languages, without any caller-supplied hints.
 // Extended languages are additional Google interface languages and Unicode
 // single-language scripts, from ext_lang_enc.h
 //
 // Forwards to DetectLanguageSummaryV2 with an empty TLD hint,
 // UNKNOWN_ENCODING, UNKNOWN_LANGUAGE, flags of zero, and extended languages
 // enabled. |language3|, |percent3|, |text_bytes| and |is_reliable| are
 // passed through untouched; the normalized scores are computed into a local
 // array and dropped.
 //
 // The summary language is returned as-is: UNKNOWN_LANGUAGE is NOT replaced
 // by ENGLISH.
 Language ExtDetectLanguageSummary(const char* buffer,
                                   int buffer_length,
                                   bool is_plain_text,
                                   Language* language3,
                                   int* percent3,
                                   int* text_bytes,
                                   bool* is_reliable) {
   // This overload does not expose scores, so they land in scratch storage.
   double unused_scores[3];

   // Hint-free configuration.
   CLDHints no_hints = {NULL, "", UNKNOWN_ENCODING, UNKNOWN_LANGUAGE};

   // Do not default to English
   return DetectLanguageSummaryV2(
       buffer, buffer_length, is_plain_text, &no_hints,
       true,              // allow_extended_lang
       0,                 // flags
       UNKNOWN_LANGUAGE,  // plus_one
       language3, percent3, unused_scores,
       NULL,              // no result chunk vector requested
       text_bytes, is_reliable);
 }
 // Scan interchange-valid UTF-8 bytes in |buffer| and detect the list of top 3
 // extended languages, using the hints supplied by the caller.
 // Extended languages are additional Google interface languages and Unicode
 // single-language scripts, from ext_lang_enc.h
 //
 // The three hints are packed into a CLDHints struct (its first field is set
 // to NULL) and forwarded to DetectLanguageSummaryV2 with flags of zero and
 // extended languages enabled. |language3|, |percent3|, |text_bytes| and
 // |is_reliable| are passed through untouched; the normalized scores are
 // computed into a local array and dropped.
 //
 // The summary language is returned as-is: UNKNOWN_LANGUAGE is NOT replaced
 // by ENGLISH.
 Language ExtDetectLanguageSummary(
     const char* buffer,
     int buffer_length,
     bool is_plain_text,
     const char* tld_hint,       // "id" boosts Indonesian
     int encoding_hint,          // SJS boosts Japanese
     Language language_hint,     // ITALIAN boosts it
     Language* language3,
     int* percent3,
     int* text_bytes,
     bool* is_reliable) {
   // This overload does not expose scores, so they land in scratch storage.
   double unused_scores[3];

   // Bundle the caller's hints for the V2 entry point.
   CLDHints caller_hints = {NULL, tld_hint, encoding_hint, language_hint};

   // Do not default to English
   return DetectLanguageSummaryV2(
       buffer, buffer_length, is_plain_text, &caller_hints,
       true,              // allow_extended_lang
       0,                 // flags
       UNKNOWN_LANGUAGE,  // plus_one
       language3, percent3, unused_scores,
       NULL,              // no result chunk vector requested
       text_bytes, is_reliable);
 }
 // Scan interchange-valid UTF-8 bytes in |buffer| and detect the list of top 3
 // extended languages, using the hints supplied by the caller, and also
 // return internal language scores as a ratio to normal score for real text
 // in that language. Scores close to 1.0 indicate normal text, while scores
 // far away from 1.0 indicate badly-skewed text or gibberish.
 //
 // The three hints are packed into a CLDHints struct (its first field is set
 // to NULL) and forwarded to DetectLanguageSummaryV2 with flags of zero and
 // extended languages enabled. |language3|, |percent3|, |normalized_score3|,
 // |text_bytes| and |is_reliable| are passed through untouched.
 //
 // The summary language is returned as-is: UNKNOWN_LANGUAGE is NOT replaced
 // by ENGLISH.
 Language ExtDetectLanguageSummary(
     const char* buffer,
     int buffer_length,
     bool is_plain_text,
     const char* tld_hint,       // "id" boosts Indonesian
     int encoding_hint,          // SJS boosts Japanese
     Language language_hint,     // ITALIAN boosts it
     Language* language3,
     int* percent3,
     double* normalized_score3,
     int* text_bytes,
     bool* is_reliable) {
   // Bundle the caller's hints for the V2 entry point.
   CLDHints caller_hints = {NULL, tld_hint, encoding_hint, language_hint};

   // Do not default to English
   return DetectLanguageSummaryV2(
       buffer, buffer_length, is_plain_text, &caller_hints,
       true,              // allow_extended_lang
       0,                 // flags
       UNKNOWN_LANGUAGE,  // plus_one
       language3, percent3, normalized_score3,
       NULL,              // no result chunk vector requested
       text_bytes, is_reliable);
 }
 // Use this one.
 // Hints arrive already collected into a CLDHints struct, and flags are
 // passed in by the caller (normally zero).
 //
 // Also returns 3 internal language scores as a ratio to normal score for
 // real text in that language. Scores close to 1.0 indicate normal text,
 // while scores far away from 1.0 indicate badly-skewed text or gibberish.
 //
 // Returns, through |resultchunkvector|, a vector of chunks in different
 // languages, so that the caller may spell-check, translate, or otherwise
 // process different parts of the input buffer in language-dependent ways.
 //
 // Every argument is forwarded unchanged to DetectLanguageSummaryV2 with
 // extended languages enabled and plus_one set to UNKNOWN_LANGUAGE. The
 // summary language is returned as-is: UNKNOWN_LANGUAGE is NOT replaced by
 // ENGLISH.
 Language ExtDetectLanguageSummary(const char* buffer,
                                   int buffer_length,
                                   bool is_plain_text,
                                   const CLDHints* cld_hints,
                                   int flags,
                                   Language* language3,
                                   int* percent3,
                                   double* normalized_score3,
                                   ResultChunkVector* resultchunkvector,
                                   int* text_bytes,
                                   bool* is_reliable) {
   // Do not default to English
   return DetectLanguageSummaryV2(
       buffer, buffer_length, is_plain_text, cld_hints,
       true,              // allow_extended_lang
       flags,
       UNKNOWN_LANGUAGE,  // plus_one
       language3, percent3, normalized_score3,
       resultchunkvector,
       text_bytes, is_reliable);
 }
 }       // End namespace CLD2

The Tor Browser / annotate

browser/components/translation/cld2/internal/compact_lang_det.cc@6474c204b198 (annotated)

browser/components/translation/cld2/internal/compact_lang_det.cc