Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | |
michael@0 | 19 | #include "tote.h" |
michael@0 | 20 | #include "lang_script.h" // For LanguageCode in Dump |
michael@0 | 21 | |
michael@0 | 22 | #include <stdio.h> |
michael@0 | 23 | #include <string.h> // For memset |
michael@0 | 24 | |
michael@0 | 25 | namespace CLD2 { |
michael@0 | 26 | |
michael@0 | 27 | // Take a set of <key, value> pairs and tote them up. |
michael@0 | 28 | // After explicitly sorting, retrieve top key, value pairs |
michael@0 | 29 | // Normal use is key=per-script language and value = probability score |
michael@0 | 30 | Tote::Tote() { |
michael@0 | 31 | in_use_mask_ = 0; |
michael@0 | 32 | byte_count_ = 0; |
michael@0 | 33 | score_count_ = 0; |
michael@0 | 34 | // No need to initialize values |
michael@0 | 35 | } |
michael@0 | 36 | |
michael@0 | 37 | Tote::~Tote() { |
michael@0 | 38 | } |
michael@0 | 39 | |
michael@0 | 40 | void Tote::Reinit() { |
michael@0 | 41 | in_use_mask_ = 0; |
michael@0 | 42 | byte_count_ = 0; |
michael@0 | 43 | score_count_ = 0; |
michael@0 | 44 | // No need to initialize values |
michael@0 | 45 | } |
michael@0 | 46 | // Increment count of quadgrams/trigrams/unigrams scored |
michael@0 | 47 | void Tote::AddScoreCount() { |
michael@0 | 48 | ++score_count_; |
michael@0 | 49 | } |
michael@0 | 50 | |
michael@0 | 51 | |
michael@0 | 52 | void Tote::Add(uint8 ikey, int idelta) { |
michael@0 | 53 | int key_group = ikey >> 2; |
michael@0 | 54 | uint64 groupmask = (1ULL << key_group); |
michael@0 | 55 | if ((in_use_mask_ & groupmask) == 0) { |
michael@0 | 56 | // Initialize this group |
michael@0 | 57 | gscore_[key_group] = 0; |
michael@0 | 58 | in_use_mask_ |= groupmask; |
michael@0 | 59 | } |
michael@0 | 60 | score_[ikey] += idelta; |
michael@0 | 61 | } |
michael@0 | 62 | |
michael@0 | 63 | |
michael@0 | 64 | // Return current top three keys |
michael@0 | 65 | void Tote::CurrentTopThreeKeys(int* key3) const { |
michael@0 | 66 | key3[0] = -1; |
michael@0 | 67 | key3[1] = -1; |
michael@0 | 68 | key3[2] = -1; |
michael@0 | 69 | int score3[3] = {-1, -1, -1}; |
michael@0 | 70 | uint64 tempmask = in_use_mask_; |
michael@0 | 71 | int base = 0; |
michael@0 | 72 | while (tempmask != 0) { |
michael@0 | 73 | if (tempmask & 1) { |
michael@0 | 74 | // Look at four in-use keys |
michael@0 | 75 | for (int i = 0; i < 4; ++i) { |
michael@0 | 76 | int insert_me = score_[base + i]; |
michael@0 | 77 | // Favor lower numbers on ties |
michael@0 | 78 | if (insert_me > score3[2]) { |
michael@0 | 79 | // Insert |
michael@0 | 80 | int insert_at = 2; |
michael@0 | 81 | if (insert_me > score3[1]) { |
michael@0 | 82 | score3[2] = score3[1]; |
michael@0 | 83 | key3[2] = key3[1]; |
michael@0 | 84 | insert_at = 1; |
michael@0 | 85 | if (insert_me > score3[0]) { |
michael@0 | 86 | score3[1] = score3[0]; |
michael@0 | 87 | key3[1] = key3[0]; |
michael@0 | 88 | insert_at = 0; |
michael@0 | 89 | } |
michael@0 | 90 | } |
michael@0 | 91 | score3[insert_at] = insert_me; |
michael@0 | 92 | key3[insert_at] = base + i; |
michael@0 | 93 | } |
michael@0 | 94 | } |
michael@0 | 95 | } |
michael@0 | 96 | tempmask >>= 1; |
michael@0 | 97 | base += 4; |
michael@0 | 98 | } |
michael@0 | 99 | } |
michael@0 | 100 | |
michael@0 | 101 | |
michael@0 | 102 | // Take a set of <key, value> pairs and tote them up. |
michael@0 | 103 | // After explicitly sorting, retrieve top key, value pairs |
michael@0 | 104 | // 0xFFFF in key signifies unused |
michael@0 | 105 | DocTote::DocTote() { |
michael@0 | 106 | // No need to initialize score_ or value_ |
michael@0 | 107 | incr_count_ = 0; |
michael@0 | 108 | sorted_ = 0; |
michael@0 | 109 | memset(closepair_, 0, sizeof(closepair_)); |
michael@0 | 110 | memset(key_, 0xFF, sizeof(key_)); |
michael@0 | 111 | } |
michael@0 | 112 | |
michael@0 | 113 | DocTote::~DocTote() { |
michael@0 | 114 | } |
michael@0 | 115 | |
michael@0 | 116 | void DocTote::Reinit() { |
michael@0 | 117 | // No need to initialize score_ or value_ |
michael@0 | 118 | incr_count_ = 0; |
michael@0 | 119 | sorted_ = 0; |
michael@0 | 120 | memset(closepair_, 0, sizeof(closepair_)); |
michael@0 | 121 | memset(key_, 0xFF, sizeof(key_)); |
michael@0 | 122 | runningscore_.Reinit(); |
michael@0 | 123 | } |
michael@0 | 124 | |
michael@0 | 125 | // Weight reliability by ibytes |
michael@0 | 126 | // Also see three-way associative comments above for Tote |
michael@0 | 127 | void DocTote::Add(uint16 ikey, int ibytes, |
michael@0 | 128 | int score, int ireliability) { |
michael@0 | 129 | ++incr_count_; |
michael@0 | 130 | |
michael@0 | 131 | // Look for existing entry in top 2 positions of 3, times 8 columns |
michael@0 | 132 | int sub0 = ikey & 15; |
michael@0 | 133 | if (key_[sub0] == ikey) { |
michael@0 | 134 | value_[sub0] += ibytes; |
michael@0 | 135 | score_[sub0] += score; |
michael@0 | 136 | reliability_[sub0] += ireliability * ibytes; |
michael@0 | 137 | return; |
michael@0 | 138 | } |
michael@0 | 139 | // Look for existing entry in other of top 2 positions of 3, times 8 columns |
michael@0 | 140 | int sub1 = sub0 ^ 8; |
michael@0 | 141 | if (key_[sub1] == ikey) { |
michael@0 | 142 | value_[sub1] += ibytes; |
michael@0 | 143 | score_[sub1] += score; |
michael@0 | 144 | reliability_[sub1] += ireliability * ibytes; |
michael@0 | 145 | return; |
michael@0 | 146 | } |
michael@0 | 147 | // Look for existing entry in third position of 3, times 8 columns |
michael@0 | 148 | int sub2 = (ikey & 7) + 16; |
michael@0 | 149 | if (key_[sub2] == ikey) { |
michael@0 | 150 | value_[sub2] += ibytes; |
michael@0 | 151 | score_[sub2] += score; |
michael@0 | 152 | reliability_[sub2] += ireliability * ibytes; |
michael@0 | 153 | return; |
michael@0 | 154 | } |
michael@0 | 155 | |
michael@0 | 156 | // Allocate new entry |
michael@0 | 157 | int alloc = -1; |
michael@0 | 158 | if (key_[sub0] == kUnusedKey) { |
michael@0 | 159 | alloc = sub0; |
michael@0 | 160 | } else if (key_[sub1] == kUnusedKey) { |
michael@0 | 161 | alloc = sub1; |
michael@0 | 162 | } else if (key_[sub2] == kUnusedKey) { |
michael@0 | 163 | alloc = sub2; |
michael@0 | 164 | } else { |
michael@0 | 165 | // All choices allocated, need to replace smallest one |
michael@0 | 166 | alloc = sub0; |
michael@0 | 167 | if (value_[sub1] < value_[alloc]) {alloc = sub1;} |
michael@0 | 168 | if (value_[sub2] < value_[alloc]) {alloc = sub2;} |
michael@0 | 169 | } |
michael@0 | 170 | key_[alloc] = ikey; |
michael@0 | 171 | value_[alloc] = ibytes; |
michael@0 | 172 | score_[alloc] = score; |
michael@0 | 173 | reliability_[alloc] = ireliability * ibytes; |
michael@0 | 174 | return; |
michael@0 | 175 | } |
michael@0 | 176 | |
michael@0 | 177 | // Find subscript of a given packed language, or -1 |
michael@0 | 178 | int DocTote::Find(uint16 ikey) { |
michael@0 | 179 | if (sorted_) { |
michael@0 | 180 | // Linear search if sorted |
michael@0 | 181 | for (int sub = 0; sub < kMaxSize_; ++sub) { |
michael@0 | 182 | if (key_[sub] == ikey) {return sub;} |
michael@0 | 183 | } |
michael@0 | 184 | return -1; |
michael@0 | 185 | } |
michael@0 | 186 | |
michael@0 | 187 | // Look for existing entry |
michael@0 | 188 | int sub0 = ikey & 15; |
michael@0 | 189 | if (key_[sub0] == ikey) { |
michael@0 | 190 | return sub0; |
michael@0 | 191 | } |
michael@0 | 192 | int sub1 = sub0 ^ 8; |
michael@0 | 193 | if (key_[sub1] == ikey) { |
michael@0 | 194 | return sub1; |
michael@0 | 195 | } |
michael@0 | 196 | int sub2 = (ikey & 7) + 16; |
michael@0 | 197 | if (key_[sub2] == ikey) { |
michael@0 | 198 | return sub2; |
michael@0 | 199 | } |
michael@0 | 200 | |
michael@0 | 201 | return -1; |
michael@0 | 202 | } |
michael@0 | 203 | |
michael@0 | 204 | // Return current top key |
michael@0 | 205 | int DocTote::CurrentTopKey() { |
michael@0 | 206 | int top_key = 0; |
michael@0 | 207 | int top_value = -1; |
michael@0 | 208 | for (int sub = 0; sub < kMaxSize_; ++sub) { |
michael@0 | 209 | if (key_[sub] == kUnusedKey) {continue;} |
michael@0 | 210 | if (top_value < value_[sub]) { |
michael@0 | 211 | top_value = value_[sub]; |
michael@0 | 212 | top_key = key_[sub]; |
michael@0 | 213 | } |
michael@0 | 214 | } |
michael@0 | 215 | return top_key; |
michael@0 | 216 | } |
michael@0 | 217 | |
michael@0 | 218 | |
michael@0 | 219 | // Sort first n entries by decreasing order of value |
michael@0 | 220 | // If key==0 other fields are not valid, treat value as -1 |
michael@0 | 221 | void DocTote::Sort(int n) { |
michael@0 | 222 | // This is n**2, but n is small |
michael@0 | 223 | for (int sub = 0; sub < n; ++sub) { |
michael@0 | 224 | if (key_[sub] == kUnusedKey) {value_[sub] = -1;} |
michael@0 | 225 | |
michael@0 | 226 | // Bubble sort key[sub] and entry[sub] |
michael@0 | 227 | for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { |
michael@0 | 228 | if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;} |
michael@0 | 229 | if (value_[sub] < value_[sub2]) { |
michael@0 | 230 | // swap |
michael@0 | 231 | uint16 tmpk = key_[sub]; |
michael@0 | 232 | key_[sub] = key_[sub2]; |
michael@0 | 233 | key_[sub2] = tmpk; |
michael@0 | 234 | |
michael@0 | 235 | int tmpv = value_[sub]; |
michael@0 | 236 | value_[sub] = value_[sub2]; |
michael@0 | 237 | value_[sub2] = tmpv; |
michael@0 | 238 | |
michael@0 | 239 | double tmps = score_[sub]; |
michael@0 | 240 | score_[sub] = score_[sub2]; |
michael@0 | 241 | score_[sub2] = tmps; |
michael@0 | 242 | |
michael@0 | 243 | int tmpr = reliability_[sub]; |
michael@0 | 244 | reliability_[sub] = reliability_[sub2]; |
michael@0 | 245 | reliability_[sub2] = tmpr; |
michael@0 | 246 | } |
michael@0 | 247 | } |
michael@0 | 248 | } |
michael@0 | 249 | sorted_ = 1; |
michael@0 | 250 | } |
michael@0 | 251 | |
michael@0 | 252 | void DocTote::Dump(FILE* f) { |
michael@0 | 253 | fprintf(f, "DocTote::Dump\n"); |
michael@0 | 254 | for (int sub = 0; sub < kMaxSize_; ++sub) { |
michael@0 | 255 | if (key_[sub] != kUnusedKey) { |
michael@0 | 256 | Language lang = static_cast<Language>(key_[sub]); |
michael@0 | 257 | fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang), |
michael@0 | 258 | value_[sub], score_[sub], reliability_[sub]); |
michael@0 | 259 | } |
michael@0 | 260 | } |
michael@0 | 261 | fprintf(f, " %d chunks scored<br>\n", incr_count_); |
michael@0 | 262 | } |
michael@0 | 263 | |
michael@0 | 264 | } // End namespace CLD2 |
michael@0 | 265 |