browser/components/translation/cld2/internal/tote.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 //
michael@0 18
michael@0 19 #include "tote.h"
michael@0 20 #include "lang_script.h" // For LanguageCode in Dump
michael@0 21
michael@0 22 #include <stdio.h>
michael@0 23 #include <string.h> // For memset
michael@0 24
michael@0 25 namespace CLD2 {
michael@0 26
michael@0 27 // Take a set of <key, value> pairs and tote them up.
michael@0 28 // After explicitly sorting, retrieve top key, value pairs
michael@0 29 // Normal use is key=per-script language and value = probability score
michael@0 30 Tote::Tote() {
michael@0 31 in_use_mask_ = 0;
michael@0 32 byte_count_ = 0;
michael@0 33 score_count_ = 0;
michael@0 34 // No need to initialize values
michael@0 35 }
michael@0 36
michael@0 37 Tote::~Tote() {
michael@0 38 }
michael@0 39
michael@0 40 void Tote::Reinit() {
michael@0 41 in_use_mask_ = 0;
michael@0 42 byte_count_ = 0;
michael@0 43 score_count_ = 0;
michael@0 44 // No need to initialize values
michael@0 45 }
michael@0 46 // Increment count of quadgrams/trigrams/unigrams scored
michael@0 47 void Tote::AddScoreCount() {
michael@0 48 ++score_count_;
michael@0 49 }
michael@0 50
michael@0 51
michael@0 52 void Tote::Add(uint8 ikey, int idelta) {
michael@0 53 int key_group = ikey >> 2;
michael@0 54 uint64 groupmask = (1ULL << key_group);
michael@0 55 if ((in_use_mask_ & groupmask) == 0) {
michael@0 56 // Initialize this group
michael@0 57 gscore_[key_group] = 0;
michael@0 58 in_use_mask_ |= groupmask;
michael@0 59 }
michael@0 60 score_[ikey] += idelta;
michael@0 61 }
michael@0 62
michael@0 63
michael@0 64 // Return current top three keys
michael@0 65 void Tote::CurrentTopThreeKeys(int* key3) const {
michael@0 66 key3[0] = -1;
michael@0 67 key3[1] = -1;
michael@0 68 key3[2] = -1;
michael@0 69 int score3[3] = {-1, -1, -1};
michael@0 70 uint64 tempmask = in_use_mask_;
michael@0 71 int base = 0;
michael@0 72 while (tempmask != 0) {
michael@0 73 if (tempmask & 1) {
michael@0 74 // Look at four in-use keys
michael@0 75 for (int i = 0; i < 4; ++i) {
michael@0 76 int insert_me = score_[base + i];
michael@0 77 // Favor lower numbers on ties
michael@0 78 if (insert_me > score3[2]) {
michael@0 79 // Insert
michael@0 80 int insert_at = 2;
michael@0 81 if (insert_me > score3[1]) {
michael@0 82 score3[2] = score3[1];
michael@0 83 key3[2] = key3[1];
michael@0 84 insert_at = 1;
michael@0 85 if (insert_me > score3[0]) {
michael@0 86 score3[1] = score3[0];
michael@0 87 key3[1] = key3[0];
michael@0 88 insert_at = 0;
michael@0 89 }
michael@0 90 }
michael@0 91 score3[insert_at] = insert_me;
michael@0 92 key3[insert_at] = base + i;
michael@0 93 }
michael@0 94 }
michael@0 95 }
michael@0 96 tempmask >>= 1;
michael@0 97 base += 4;
michael@0 98 }
michael@0 99 }
michael@0 100
michael@0 101
michael@0 102 // Take a set of <key, value> pairs and tote them up.
michael@0 103 // After explicitly sorting, retrieve top key, value pairs
michael@0 104 // 0xFFFF in key signifies unused
michael@0 105 DocTote::DocTote() {
michael@0 106 // No need to initialize score_ or value_
michael@0 107 incr_count_ = 0;
michael@0 108 sorted_ = 0;
michael@0 109 memset(closepair_, 0, sizeof(closepair_));
michael@0 110 memset(key_, 0xFF, sizeof(key_));
michael@0 111 }
michael@0 112
michael@0 113 DocTote::~DocTote() {
michael@0 114 }
michael@0 115
michael@0 116 void DocTote::Reinit() {
michael@0 117 // No need to initialize score_ or value_
michael@0 118 incr_count_ = 0;
michael@0 119 sorted_ = 0;
michael@0 120 memset(closepair_, 0, sizeof(closepair_));
michael@0 121 memset(key_, 0xFF, sizeof(key_));
michael@0 122 runningscore_.Reinit();
michael@0 123 }
michael@0 124
michael@0 125 // Weight reliability by ibytes
michael@0 126 // Also see three-way associative comments above for Tote
michael@0 127 void DocTote::Add(uint16 ikey, int ibytes,
michael@0 128 int score, int ireliability) {
michael@0 129 ++incr_count_;
michael@0 130
michael@0 131 // Look for existing entry in top 2 positions of 3, times 8 columns
michael@0 132 int sub0 = ikey & 15;
michael@0 133 if (key_[sub0] == ikey) {
michael@0 134 value_[sub0] += ibytes;
michael@0 135 score_[sub0] += score;
michael@0 136 reliability_[sub0] += ireliability * ibytes;
michael@0 137 return;
michael@0 138 }
michael@0 139 // Look for existing entry in other of top 2 positions of 3, times 8 columns
michael@0 140 int sub1 = sub0 ^ 8;
michael@0 141 if (key_[sub1] == ikey) {
michael@0 142 value_[sub1] += ibytes;
michael@0 143 score_[sub1] += score;
michael@0 144 reliability_[sub1] += ireliability * ibytes;
michael@0 145 return;
michael@0 146 }
michael@0 147 // Look for existing entry in third position of 3, times 8 columns
michael@0 148 int sub2 = (ikey & 7) + 16;
michael@0 149 if (key_[sub2] == ikey) {
michael@0 150 value_[sub2] += ibytes;
michael@0 151 score_[sub2] += score;
michael@0 152 reliability_[sub2] += ireliability * ibytes;
michael@0 153 return;
michael@0 154 }
michael@0 155
michael@0 156 // Allocate new entry
michael@0 157 int alloc = -1;
michael@0 158 if (key_[sub0] == kUnusedKey) {
michael@0 159 alloc = sub0;
michael@0 160 } else if (key_[sub1] == kUnusedKey) {
michael@0 161 alloc = sub1;
michael@0 162 } else if (key_[sub2] == kUnusedKey) {
michael@0 163 alloc = sub2;
michael@0 164 } else {
michael@0 165 // All choices allocated, need to replace smallest one
michael@0 166 alloc = sub0;
michael@0 167 if (value_[sub1] < value_[alloc]) {alloc = sub1;}
michael@0 168 if (value_[sub2] < value_[alloc]) {alloc = sub2;}
michael@0 169 }
michael@0 170 key_[alloc] = ikey;
michael@0 171 value_[alloc] = ibytes;
michael@0 172 score_[alloc] = score;
michael@0 173 reliability_[alloc] = ireliability * ibytes;
michael@0 174 return;
michael@0 175 }
michael@0 176
michael@0 177 // Find subscript of a given packed language, or -1
michael@0 178 int DocTote::Find(uint16 ikey) {
michael@0 179 if (sorted_) {
michael@0 180 // Linear search if sorted
michael@0 181 for (int sub = 0; sub < kMaxSize_; ++sub) {
michael@0 182 if (key_[sub] == ikey) {return sub;}
michael@0 183 }
michael@0 184 return -1;
michael@0 185 }
michael@0 186
michael@0 187 // Look for existing entry
michael@0 188 int sub0 = ikey & 15;
michael@0 189 if (key_[sub0] == ikey) {
michael@0 190 return sub0;
michael@0 191 }
michael@0 192 int sub1 = sub0 ^ 8;
michael@0 193 if (key_[sub1] == ikey) {
michael@0 194 return sub1;
michael@0 195 }
michael@0 196 int sub2 = (ikey & 7) + 16;
michael@0 197 if (key_[sub2] == ikey) {
michael@0 198 return sub2;
michael@0 199 }
michael@0 200
michael@0 201 return -1;
michael@0 202 }
michael@0 203
michael@0 204 // Return current top key
michael@0 205 int DocTote::CurrentTopKey() {
michael@0 206 int top_key = 0;
michael@0 207 int top_value = -1;
michael@0 208 for (int sub = 0; sub < kMaxSize_; ++sub) {
michael@0 209 if (key_[sub] == kUnusedKey) {continue;}
michael@0 210 if (top_value < value_[sub]) {
michael@0 211 top_value = value_[sub];
michael@0 212 top_key = key_[sub];
michael@0 213 }
michael@0 214 }
michael@0 215 return top_key;
michael@0 216 }
michael@0 217
michael@0 218
michael@0 219 // Sort first n entries by decreasing order of value
michael@0 220 // If key==0 other fields are not valid, treat value as -1
michael@0 221 void DocTote::Sort(int n) {
michael@0 222 // This is n**2, but n is small
michael@0 223 for (int sub = 0; sub < n; ++sub) {
michael@0 224 if (key_[sub] == kUnusedKey) {value_[sub] = -1;}
michael@0 225
michael@0 226 // Bubble sort key[sub] and entry[sub]
michael@0 227 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
michael@0 228 if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;}
michael@0 229 if (value_[sub] < value_[sub2]) {
michael@0 230 // swap
michael@0 231 uint16 tmpk = key_[sub];
michael@0 232 key_[sub] = key_[sub2];
michael@0 233 key_[sub2] = tmpk;
michael@0 234
michael@0 235 int tmpv = value_[sub];
michael@0 236 value_[sub] = value_[sub2];
michael@0 237 value_[sub2] = tmpv;
michael@0 238
michael@0 239 double tmps = score_[sub];
michael@0 240 score_[sub] = score_[sub2];
michael@0 241 score_[sub2] = tmps;
michael@0 242
michael@0 243 int tmpr = reliability_[sub];
michael@0 244 reliability_[sub] = reliability_[sub2];
michael@0 245 reliability_[sub2] = tmpr;
michael@0 246 }
michael@0 247 }
michael@0 248 }
michael@0 249 sorted_ = 1;
michael@0 250 }
michael@0 251
michael@0 252 void DocTote::Dump(FILE* f) {
michael@0 253 fprintf(f, "DocTote::Dump\n");
michael@0 254 for (int sub = 0; sub < kMaxSize_; ++sub) {
michael@0 255 if (key_[sub] != kUnusedKey) {
michael@0 256 Language lang = static_cast<Language>(key_[sub]);
michael@0 257 fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang),
michael@0 258 value_[sub], score_[sub], reliability_[sub]);
michael@0 259 }
michael@0 260 }
michael@0 261 fprintf(f, " %d chunks scored<br>\n", incr_count_);
michael@0 262 }
michael@0 263
michael@0 264 } // End namespace CLD2
michael@0 265

mercurial