1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/tote.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,265 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 +#include "tote.h" 1.23 +#include "lang_script.h" // For LanguageCode in Dump 1.24 + 1.25 +#include <stdio.h> 1.26 +#include <string.h> // For memset 1.27 + 1.28 +namespace CLD2 { 1.29 + 1.30 +// Take a set of <key, value> pairs and tote them up. 1.31 +// After explicitly sorting, retrieve top key, value pairs 1.32 +// Normal use is key=per-script language and value = probability score 1.33 +Tote::Tote() { 1.34 + in_use_mask_ = 0; 1.35 + byte_count_ = 0; 1.36 + score_count_ = 0; 1.37 + // No need to initialize values 1.38 +} 1.39 + 1.40 +Tote::~Tote() { 1.41 +} 1.42 + 1.43 +void Tote::Reinit() { 1.44 + in_use_mask_ = 0; 1.45 + byte_count_ = 0; 1.46 + score_count_ = 0; 1.47 + // No need to initialize values 1.48 +} 1.49 +// Increment count of quadgrams/trigrams/unigrams scored 1.50 +void Tote::AddScoreCount() { 1.51 + ++score_count_; 1.52 +} 1.53 + 1.54 + 1.55 +void Tote::Add(uint8 ikey, int idelta) { 1.56 + int key_group = ikey >> 2; 1.57 + uint64 groupmask = (1ULL << key_group); 1.58 + if ((in_use_mask_ & groupmask) == 0) { 1.59 + // Initialize this group 1.60 + gscore_[key_group] = 0; 1.61 + in_use_mask_ |= groupmask; 1.62 + } 1.63 + score_[ikey] += idelta; 1.64 +} 1.65 + 1.66 + 1.67 +// Return current top three keys 1.68 +void Tote::CurrentTopThreeKeys(int* key3) const { 1.69 + key3[0] = -1; 1.70 + key3[1] = -1; 1.71 + key3[2] = -1; 1.72 + int score3[3] = {-1, -1, -1}; 1.73 + uint64 tempmask = in_use_mask_; 1.74 + int base = 0; 1.75 + while (tempmask != 0) { 1.76 + if (tempmask & 1) { 1.77 + // Look at four in-use keys 1.78 + for (int i = 0; i < 4; ++i) { 1.79 + int insert_me = score_[base + i]; 1.80 + // Favor lower numbers on ties 1.81 + if (insert_me > score3[2]) { 1.82 + // Insert 1.83 + int insert_at = 2; 1.84 + if (insert_me > score3[1]) { 1.85 + score3[2] = score3[1]; 1.86 + key3[2] = key3[1]; 1.87 + insert_at = 1; 1.88 + if (insert_me > score3[0]) { 1.89 + score3[1] = score3[0]; 1.90 + key3[1] = key3[0]; 1.91 + insert_at = 0; 1.92 + } 1.93 + } 1.94 + score3[insert_at] = insert_me; 1.95 + key3[insert_at] = base + i; 1.96 + } 1.97 + } 1.98 + } 1.99 + tempmask >>= 1; 1.100 + base += 4; 1.101 + } 1.102 +} 1.103 + 1.104 + 1.105 +// Take a set of <key, value> pairs and tote them up. 1.106 +// After explicitly sorting, retrieve top key, value pairs 1.107 +// 0xFFFF in key signifies unused 1.108 +DocTote::DocTote() { 1.109 + // No need to initialize score_ or value_ 1.110 + incr_count_ = 0; 1.111 + sorted_ = 0; 1.112 + memset(closepair_, 0, sizeof(closepair_)); 1.113 + memset(key_, 0xFF, sizeof(key_)); 1.114 +} 1.115 + 1.116 +DocTote::~DocTote() { 1.117 +} 1.118 + 1.119 +void DocTote::Reinit() { 1.120 + // No need to initialize score_ or value_ 1.121 + incr_count_ = 0; 1.122 + sorted_ = 0; 1.123 + memset(closepair_, 0, sizeof(closepair_)); 1.124 + memset(key_, 0xFF, sizeof(key_)); 1.125 + runningscore_.Reinit(); 1.126 +} 1.127 + 1.128 +// Weight reliability by ibytes 1.129 +// Also see three-way associative comments above for Tote 1.130 +void DocTote::Add(uint16 ikey, int ibytes, 1.131 + int score, int ireliability) { 1.132 + ++incr_count_; 1.133 + 1.134 + // Look for existing entry in top 2 positions of 3, times 8 columns 1.135 + int sub0 = ikey & 15; 1.136 + if (key_[sub0] == ikey) { 1.137 + value_[sub0] += ibytes; 1.138 + score_[sub0] += score; 1.139 + reliability_[sub0] += ireliability * ibytes; 1.140 + return; 1.141 + } 1.142 + // Look for existing entry in other of top 2 positions of 3, times 8 columns 1.143 + int sub1 = sub0 ^ 8; 1.144 + if (key_[sub1] == ikey) { 1.145 + value_[sub1] += ibytes; 1.146 + score_[sub1] += score; 1.147 + reliability_[sub1] += ireliability * ibytes; 1.148 + return; 1.149 + } 1.150 + // Look for existing entry in third position of 3, times 8 columns 1.151 + int sub2 = (ikey & 7) + 16; 1.152 + if (key_[sub2] == ikey) { 1.153 + value_[sub2] += ibytes; 1.154 + score_[sub2] += score; 1.155 + reliability_[sub2] += ireliability * ibytes; 1.156 + return; 1.157 + } 1.158 + 1.159 + // Allocate new entry 1.160 + int alloc = -1; 1.161 + if (key_[sub0] == kUnusedKey) { 1.162 + alloc = sub0; 1.163 + } else if (key_[sub1] == kUnusedKey) { 1.164 + alloc = sub1; 1.165 + } else if (key_[sub2] == kUnusedKey) { 1.166 + alloc = sub2; 1.167 + } else { 1.168 + // All choices allocated, need to replace smallest one 1.169 + alloc = sub0; 1.170 + if (value_[sub1] < value_[alloc]) {alloc = sub1;} 1.171 + if (value_[sub2] < value_[alloc]) {alloc = sub2;} 1.172 + } 1.173 + key_[alloc] = ikey; 1.174 + value_[alloc] = ibytes; 1.175 + score_[alloc] = score; 1.176 + reliability_[alloc] = ireliability * ibytes; 1.177 + return; 1.178 +} 1.179 + 1.180 +// Find subscript of a given packed language, or -1 1.181 +int DocTote::Find(uint16 ikey) { 1.182 + if (sorted_) { 1.183 + // Linear search if sorted 1.184 + for (int sub = 0; sub < kMaxSize_; ++sub) { 1.185 + if (key_[sub] == ikey) {return sub;} 1.186 + } 1.187 + return -1; 1.188 + } 1.189 + 1.190 + // Look for existing entry 1.191 + int sub0 = ikey & 15; 1.192 + if (key_[sub0] == ikey) { 1.193 + return sub0; 1.194 + } 1.195 + int sub1 = sub0 ^ 8; 1.196 + if (key_[sub1] == ikey) { 1.197 + return sub1; 1.198 + } 1.199 + int sub2 = (ikey & 7) + 16; 1.200 + if (key_[sub2] == ikey) { 1.201 + return sub2; 1.202 + } 1.203 + 1.204 + return -1; 1.205 +} 1.206 + 1.207 +// Return current top key 1.208 +int DocTote::CurrentTopKey() { 1.209 + int top_key = 0; 1.210 + int top_value = -1; 1.211 + for (int sub = 0; sub < kMaxSize_; ++sub) { 1.212 + if (key_[sub] == kUnusedKey) {continue;} 1.213 + if (top_value < value_[sub]) { 1.214 + top_value = value_[sub]; 1.215 + top_key = key_[sub]; 1.216 + } 1.217 + } 1.218 + return top_key; 1.219 +} 1.220 + 1.221 + 1.222 +// Sort first n entries by decreasing order of value 1.223 +// If key==0 other fields are not valid, treat value as -1 1.224 +void DocTote::Sort(int n) { 1.225 + // This is n**2, but n is small 1.226 + for (int sub = 0; sub < n; ++sub) { 1.227 + if (key_[sub] == kUnusedKey) {value_[sub] = -1;} 1.228 + 1.229 + // Bubble sort key[sub] and entry[sub] 1.230 + for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { 1.231 + if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;} 1.232 + if (value_[sub] < value_[sub2]) { 1.233 + // swap 1.234 + uint16 tmpk = key_[sub]; 1.235 + key_[sub] = key_[sub2]; 1.236 + key_[sub2] = tmpk; 1.237 + 1.238 + int tmpv = value_[sub]; 1.239 + value_[sub] = value_[sub2]; 1.240 + value_[sub2] = tmpv; 1.241 + 1.242 + double tmps = score_[sub]; 1.243 + score_[sub] = score_[sub2]; 1.244 + score_[sub2] = tmps; 1.245 + 1.246 + int tmpr = reliability_[sub]; 1.247 + reliability_[sub] = reliability_[sub2]; 1.248 + reliability_[sub2] = tmpr; 1.249 + } 1.250 + } 1.251 + } 1.252 + sorted_ = 1; 1.253 +} 1.254 + 1.255 +void DocTote::Dump(FILE* f) { 1.256 + fprintf(f, "DocTote::Dump\n"); 1.257 + for (int sub = 0; sub < kMaxSize_; ++sub) { 1.258 + if (key_[sub] != kUnusedKey) { 1.259 + Language lang = static_cast<Language>(key_[sub]); 1.260 + fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang), 1.261 + value_[sub], score_[sub], reliability_[sub]); 1.262 + } 1.263 + } 1.264 + fprintf(f, " %d chunks scored<br>\n", incr_count_); 1.265 +} 1.266 + 1.267 +} // End namespace CLD2 1.268 +