1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/tote.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,112 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 +#ifndef I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_ 1.23 +#define I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_ 1.24 + 1.25 +#include <stdio.h> 1.26 +#include "integral_types.h" // for uint8 etc 1.27 + 1.28 +namespace CLD2 { 1.29 + 1.30 + 1.31 +// Take a set of <key, score> pairs and tote them up. 1.32 +// Key is an 8-bit per-script language 1.33 +// After explicitly sorting, retrieve top key, score pairs 1.34 +// Normal use is key=per-script language 1.35 +// The main data structure is an array of 256 uint16 counts. We normally 1.36 +// expect this to be initialized, added-to about 60 times, then the top three 1.37 +// items found. To reduce the initial and final time, we also keep a bit vector 1.38 +// of unused (and uninitialized) parts: 64 bits, each covering four keys. 
1.39 +class Tote { 1.40 + public: 1.41 + Tote(); 1.42 + ~Tote(); 1.43 + void Reinit(); 1.44 + void AddScoreCount(); 1.45 + void Add(uint8 ikey, int idelta); 1.46 + void AddBytes(int ibytes) {byte_count_ += ibytes;} 1.47 + void CurrentTopThreeKeys(int* key3) const; 1.48 + int GetScoreCount() const {return score_count_;} 1.49 + int GetByteCount() const {return byte_count_;} 1.50 + int GetScore(int i) const {return score_[i];} 1.51 + void SetScoreCount(uint16 v) {score_count_ = v;} 1.52 + void SetScore(int i, int v) {score_[i] = v;} 1.53 + 1.54 + private: 1.55 + uint64 in_use_mask_; // 64 bits, one for each group of 4 scores. 1.56 + // 0 = not initialized,not used 1.57 + int byte_count_; // Bytes of text scored 1.58 + int score_count_; // Number of quadgrams/etc. scored 1.59 + union { 1.60 + uint64 gscore_[64]; // For alignment and clearing quickly 1.61 + uint16 score_[256]; // Probability score sum 1.62 + }; 1.63 + 1.64 +}; 1.65 + 1.66 + 1.67 +// Take a set of <key, score, reliability> triples and tote them up. 
1.68 +// Key is a 16-bit full language 1.69 +// After explicitly sorting, retrieve top key, score, reliability triples 1.70 +class DocTote { 1.71 + public: 1.72 + DocTote(); 1.73 + ~DocTote(); 1.74 + void Reinit(); 1.75 + void Add(uint16 ikey, int ibytes, int score, int ireliability); 1.76 + int Find(uint16 ikey); 1.77 + void AddClosePair(int subscr, int val) {closepair_[subscr] += val;} 1.78 + int CurrentTopKey(); 1.79 + Tote* RunningScore() {return &runningscore_;} 1.80 + void Sort(int n); 1.81 + void Dump(FILE* f); 1.82 + 1.83 + int GetIncrCount() const {return incr_count_;} 1.84 + int GetClosePair(int subscr) const {return closepair_[subscr];} 1.85 + int MaxSize() const {return kMaxSize_;} 1.86 + uint16 Key(int i) const {return key_[i];} 1.87 + int Value(int i) const {return value_[i];} // byte count 1.88 + int Score(int i) const {return score_[i];} // sum lg prob 1.89 + int Reliability(int i) const {return reliability_[i];} 1.90 + void SetKey(int i, int v) {key_[i] = v;} 1.91 + void SetValue(int i, int v) {value_[i] = v;} 1.92 + void SetScore(int i, int v) {score_[i] = v;} 1.93 + void SetReliability(int i, int v) {reliability_[i] = v;} 1.94 + 1.95 + static const uint16 kUnusedKey = 0xFFFF; 1.96 + 1.97 + private: 1.98 + static const int kMaxSize_ = 24; 1.99 + static const int kMaxClosePairSize_ = 8; 1.100 + 1.101 + int incr_count_; // Number of Add calls 1.102 + int sorted_; // Contents have been sorted, cannot Add 1.103 + Tote runningscore_; // Top lang scores across entire doc, for 1.104 + // helping resolve close pairs 1.105 + // Align at multiple of 8 bytes 1.106 + int closepair_[kMaxClosePairSize_]; 1.107 + uint16 key_[kMaxSize_]; // Lang unassigned = 0xFFFF, valid = 1..1023 1.108 + int value_[kMaxSize_]; // Bytecount this lang 1.109 + int score_[kMaxSize_]; // Probability score sum 1.110 + int reliability_[kMaxSize_]; // Percentage 0..100 1.111 +}; 1.112 + 1.113 +} // End namespace CLD2 1.114 + 1.115 +#endif // I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_