michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: michael@0: #include "tote.h" michael@0: #include "lang_script.h" // For LanguageCode in Dump michael@0: michael@0: #include michael@0: #include // For memset michael@0: michael@0: namespace CLD2 { michael@0: michael@0: // Take a set of pairs and tote them up. michael@0: // After explicitly sorting, retrieve top key, value pairs michael@0: // Normal use is key=per-script language and value = probability score michael@0: Tote::Tote() { michael@0: in_use_mask_ = 0; michael@0: byte_count_ = 0; michael@0: score_count_ = 0; michael@0: // No need to initialize values michael@0: } michael@0: michael@0: Tote::~Tote() { michael@0: } michael@0: michael@0: void Tote::Reinit() { michael@0: in_use_mask_ = 0; michael@0: byte_count_ = 0; michael@0: score_count_ = 0; michael@0: // No need to initialize values michael@0: } michael@0: // Increment count of quadgrams/trigrams/unigrams scored michael@0: void Tote::AddScoreCount() { michael@0: ++score_count_; michael@0: } michael@0: michael@0: michael@0: void Tote::Add(uint8 ikey, int idelta) { michael@0: int key_group = ikey >> 2; michael@0: uint64 groupmask = (1ULL << key_group); michael@0: if ((in_use_mask_ & groupmask) == 0) { michael@0: // Initialize this group michael@0: gscore_[key_group] = 0; michael@0: in_use_mask_ |= groupmask; michael@0: } michael@0: score_[ikey] += idelta; michael@0: } michael@0: michael@0: michael@0: // Return current top three keys michael@0: void Tote::CurrentTopThreeKeys(int* key3) const { michael@0: key3[0] = -1; michael@0: key3[1] = -1; michael@0: key3[2] = -1; michael@0: int score3[3] = {-1, -1, -1}; michael@0: uint64 tempmask = in_use_mask_; michael@0: int base = 0; michael@0: while (tempmask != 0) { michael@0: if (tempmask & 1) { michael@0: // Look at four in-use keys michael@0: for (int i = 0; i < 4; ++i) { michael@0: int insert_me = score_[base + i]; michael@0: // Favor lower numbers on ties michael@0: if (insert_me > score3[2]) { michael@0: // Insert michael@0: int insert_at = 2; michael@0: if (insert_me > score3[1]) { michael@0: score3[2] = score3[1]; michael@0: key3[2] = key3[1]; michael@0: insert_at = 1; michael@0: if (insert_me > score3[0]) { michael@0: score3[1] = score3[0]; michael@0: key3[1] = key3[0]; michael@0: insert_at = 0; michael@0: } michael@0: } michael@0: score3[insert_at] = insert_me; michael@0: key3[insert_at] = base + i; michael@0: } michael@0: } michael@0: } michael@0: tempmask >>= 1; michael@0: base += 4; michael@0: } michael@0: } michael@0: michael@0: michael@0: // Take a set of pairs and tote them up. michael@0: // After explicitly sorting, retrieve top key, value pairs michael@0: // 0xFFFF in key signifies unused michael@0: DocTote::DocTote() { michael@0: // No need to initialize score_ or value_ michael@0: incr_count_ = 0; michael@0: sorted_ = 0; michael@0: memset(closepair_, 0, sizeof(closepair_)); michael@0: memset(key_, 0xFF, sizeof(key_)); michael@0: } michael@0: michael@0: DocTote::~DocTote() { michael@0: } michael@0: michael@0: void DocTote::Reinit() { michael@0: // No need to initialize score_ or value_ michael@0: incr_count_ = 0; michael@0: sorted_ = 0; michael@0: memset(closepair_, 0, sizeof(closepair_)); michael@0: memset(key_, 0xFF, sizeof(key_)); michael@0: runningscore_.Reinit(); michael@0: } michael@0: michael@0: // Weight reliability by ibytes michael@0: // Also see three-way associative comments above for Tote michael@0: void DocTote::Add(uint16 ikey, int ibytes, michael@0: int score, int ireliability) { michael@0: ++incr_count_; michael@0: michael@0: // Look for existing entry in top 2 positions of 3, times 8 columns michael@0: int sub0 = ikey & 15; michael@0: if (key_[sub0] == ikey) { michael@0: value_[sub0] += ibytes; michael@0: score_[sub0] += score; michael@0: reliability_[sub0] += ireliability * ibytes; michael@0: return; michael@0: } michael@0: // Look for existing entry in other of top 2 positions of 3, times 8 columns michael@0: int sub1 = sub0 ^ 8; michael@0: if (key_[sub1] == ikey) { michael@0: value_[sub1] += ibytes; michael@0: score_[sub1] += score; michael@0: reliability_[sub1] += ireliability * ibytes; michael@0: return; michael@0: } michael@0: // Look for existing entry in third position of 3, times 8 columns michael@0: int sub2 = (ikey & 7) + 16; michael@0: if (key_[sub2] == ikey) { michael@0: value_[sub2] += ibytes; michael@0: score_[sub2] += score; michael@0: reliability_[sub2] += ireliability * ibytes; michael@0: return; michael@0: } michael@0: michael@0: // Allocate new entry michael@0: int alloc = -1; michael@0: if (key_[sub0] == kUnusedKey) { michael@0: alloc = sub0; michael@0: } else if (key_[sub1] == kUnusedKey) { michael@0: alloc = sub1; michael@0: } else if (key_[sub2] == kUnusedKey) { michael@0: alloc = sub2; michael@0: } else { michael@0: // All choices allocated, need to replace smallest one michael@0: alloc = sub0; michael@0: if (value_[sub1] < value_[alloc]) {alloc = sub1;} michael@0: if (value_[sub2] < value_[alloc]) {alloc = sub2;} michael@0: } michael@0: key_[alloc] = ikey; michael@0: value_[alloc] = ibytes; michael@0: score_[alloc] = score; michael@0: reliability_[alloc] = ireliability * ibytes; michael@0: return; michael@0: } michael@0: michael@0: // Find subscript of a given packed language, or -1 michael@0: int DocTote::Find(uint16 ikey) { michael@0: if (sorted_) { michael@0: // Linear search if sorted michael@0: for (int sub = 0; sub < kMaxSize_; ++sub) { michael@0: if (key_[sub] == ikey) {return sub;} michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: // Look for existing entry michael@0: int sub0 = ikey & 15; michael@0: if (key_[sub0] == ikey) { michael@0: return sub0; michael@0: } michael@0: int sub1 = sub0 ^ 8; michael@0: if (key_[sub1] == ikey) { michael@0: return sub1; michael@0: } michael@0: int sub2 = (ikey & 7) + 16; michael@0: if (key_[sub2] == ikey) { michael@0: return sub2; michael@0: } michael@0: michael@0: return -1; michael@0: } michael@0: michael@0: // Return current top key michael@0: int DocTote::CurrentTopKey() { michael@0: int top_key = 0; michael@0: int top_value = -1; michael@0: for (int sub = 0; sub < kMaxSize_; ++sub) { michael@0: if (key_[sub] == kUnusedKey) {continue;} michael@0: if (top_value < value_[sub]) { michael@0: top_value = value_[sub]; michael@0: top_key = key_[sub]; michael@0: } michael@0: } michael@0: return top_key; michael@0: } michael@0: michael@0: michael@0: // Sort first n entries by decreasing order of value michael@0: // If key==0 other fields are not valid, treat value as -1 michael@0: void DocTote::Sort(int n) { michael@0: // This is n**2, but n is small michael@0: for (int sub = 0; sub < n; ++sub) { michael@0: if (key_[sub] == kUnusedKey) {value_[sub] = -1;} michael@0: michael@0: // Bubble sort key[sub] and entry[sub] michael@0: for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { michael@0: if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;} michael@0: if (value_[sub] < value_[sub2]) { michael@0: // swap michael@0: uint16 tmpk = key_[sub]; michael@0: key_[sub] = key_[sub2]; michael@0: key_[sub2] = tmpk; michael@0: michael@0: int tmpv = value_[sub]; michael@0: value_[sub] = value_[sub2]; michael@0: value_[sub2] = tmpv; michael@0: michael@0: double tmps = score_[sub]; michael@0: score_[sub] = score_[sub2]; michael@0: score_[sub2] = tmps; michael@0: michael@0: int tmpr = reliability_[sub]; michael@0: reliability_[sub] = reliability_[sub2]; michael@0: reliability_[sub2] = tmpr; michael@0: } michael@0: } michael@0: } michael@0: sorted_ = 1; michael@0: } michael@0: michael@0: void DocTote::Dump(FILE* f) { michael@0: fprintf(f, "DocTote::Dump\n"); michael@0: for (int sub = 0; sub < kMaxSize_; ++sub) { michael@0: if (key_[sub] != kUnusedKey) { michael@0: Language lang = static_cast(key_[sub]); michael@0: fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang), michael@0: value_[sub], score_[sub], reliability_[sub]); michael@0: } michael@0: } michael@0: fprintf(f, " %d chunks scored
\n", incr_count_); michael@0: } michael@0: michael@0: } // End namespace CLD2 michael@0: