michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: michael@0: #ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__ michael@0: #define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__ michael@0: michael@0: michael@0: #include michael@0: #include "integral_types.h" michael@0: #include "lang_script.h" michael@0: #include "../public/encodings.h" michael@0: michael@0: namespace CLD2 { michael@0: michael@0: // Packed , weight in [-32..31] (powers of 2**1.6 ~=3.03) michael@0: // Full language in bottom 10 bits, weight in top 6 bits michael@0: typedef int16 OneCLDLangPrior; michael@0: michael@0: const int kMaxOneCLDLangPrior = 14; michael@0: typedef struct { michael@0: int32 n; michael@0: OneCLDLangPrior prior[kMaxOneCLDLangPrior]; michael@0: } CLDLangPriors; michael@0: michael@0: // Reading exposed here; setting hidden in .cc michael@0: inline int GetCLDPriorWeight(OneCLDLangPrior olp) { michael@0: return olp >> 10; michael@0: } michael@0: inline Language GetCLDPriorLang(OneCLDLangPrior olp) { michael@0: return static_cast(olp & 0x3ff); michael@0: } michael@0: michael@0: inline int32 GetCLDLangPriorCount(CLDLangPriors* lps) { michael@0: return lps->n; michael@0: } michael@0: michael@0: inline void InitCLDLangPriors(CLDLangPriors* lps) { michael@0: lps->n = 0; michael@0: } michael@0: michael@0: // Trim language priors to no more than max_entries, keeping largest abs weights michael@0: void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps); michael@0: michael@0: // Trim language tag string to canonical form for each language michael@0: // Input is from GetLangTagsFromHtml(), already lowercased michael@0: std::string TrimCLDLangTagsHint(const std::string& langtags); michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is from GetLangTagsFromHtml(), already lowercased michael@0: void SetCLDLangTagsHint(const std::string& langtags, CLDLangPriors* langpriors); michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is from HTTP content-language michael@0: void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors); michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is from GetTLD(), already lowercased michael@0: void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors); michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is from DetectEncoding() michael@0: void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors); michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is from random source michael@0: void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors); michael@0: michael@0: // Make printable string of priors michael@0: std::string DumpCLDLangPriors(const CLDLangPriors* langpriors); michael@0: michael@0: michael@0: // Get language tag hints from HTML body michael@0: // Normalize: remove spaces and make lowercase comma list michael@0: std::string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len, michael@0: int32 max_scan_bytes); michael@0: michael@0: } // End namespace CLD2 michael@0: michael@0: #endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__ michael@0: