browser/components/translation/cld2/internal/tote.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/tote.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,265 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +
    1.22 +#include "tote.h"
    1.23 +#include "lang_script.h"    // For LanguageCode in Dump
    1.24 +
    1.25 +#include <stdio.h>
    1.26 +#include <string.h>         // For memset
    1.27 +
    1.28 +namespace CLD2 {
    1.29 +
    1.30 +// Take a set of <key, value> pairs and tote them up.
    1.31 +// After explicitly sorting, retrieve top key, value pairs
    1.32 +// Normal use is key=per-script language and value = probability score
    1.33 +Tote::Tote() {
    1.34 +  in_use_mask_ = 0;
    1.35 +  byte_count_ = 0;
    1.36 +  score_count_ = 0;
    1.37 +  // No need to initialize values
    1.38 +}
    1.39 +
    1.40 +Tote::~Tote() {
    1.41 +}
    1.42 +
    1.43 +void Tote::Reinit() {
    1.44 +  in_use_mask_ = 0;
    1.45 +  byte_count_ = 0;
    1.46 +  score_count_ = 0;
    1.47 +  // No need to initialize values
    1.48 +}
    1.49 +// Increment count of quadgrams/trigrams/unigrams scored
    1.50 +void Tote::AddScoreCount() {
    1.51 +  ++score_count_;
    1.52 +}
    1.53 +
    1.54 +
    1.55 +void Tote::Add(uint8 ikey, int idelta) {
    1.56 +  int key_group = ikey >> 2;
    1.57 +  uint64 groupmask = (1ULL << key_group);
    1.58 +  if ((in_use_mask_ & groupmask) == 0) {
    1.59 +    // Initialize this group
    1.60 +    gscore_[key_group] = 0;
    1.61 +    in_use_mask_ |= groupmask;
    1.62 +  }
    1.63 +  score_[ikey] += idelta;
    1.64 +}
    1.65 +
    1.66 +
    1.67 +// Return current top three keys
    1.68 +void Tote::CurrentTopThreeKeys(int* key3) const {
    1.69 +  key3[0] = -1;
    1.70 +  key3[1] = -1;
    1.71 +  key3[2] = -1;
    1.72 +  int score3[3] = {-1, -1, -1};
    1.73 +  uint64 tempmask = in_use_mask_;
    1.74 +  int base = 0;
    1.75 +  while (tempmask != 0) {
    1.76 +    if (tempmask & 1) {
    1.77 +      // Look at four in-use keys
    1.78 +      for (int i = 0; i < 4; ++i) {
    1.79 +        int insert_me = score_[base + i];
    1.80 +        // Favor lower numbers on ties
    1.81 +        if (insert_me > score3[2]) {
    1.82 +          // Insert
    1.83 +          int insert_at = 2;
    1.84 +          if (insert_me > score3[1]) {
    1.85 +            score3[2] = score3[1];
    1.86 +            key3[2] = key3[1];
    1.87 +            insert_at = 1;
    1.88 +            if (insert_me > score3[0]) {
    1.89 +              score3[1] = score3[0];
    1.90 +              key3[1] = key3[0];
    1.91 +              insert_at = 0;
    1.92 +            }
    1.93 +          }
    1.94 +          score3[insert_at] = insert_me;
    1.95 +          key3[insert_at] = base + i;
    1.96 +        }
    1.97 +      }
    1.98 +    }
    1.99 +    tempmask >>= 1;
   1.100 +    base += 4;
   1.101 +  }
   1.102 +}
   1.103 +
   1.104 +
   1.105 +// Take a set of <key, value> pairs and tote them up.
   1.106 +// After explicitly sorting, retrieve top key, value pairs
   1.107 +// 0xFFFF in key signifies unused
   1.108 +DocTote::DocTote() {
   1.109 +  // No need to initialize score_ or value_
   1.110 +  incr_count_ = 0;
   1.111 +  sorted_ = 0;
   1.112 +  memset(closepair_, 0, sizeof(closepair_));
   1.113 +  memset(key_, 0xFF, sizeof(key_));
   1.114 +}
   1.115 +
   1.116 +DocTote::~DocTote() {
   1.117 +}
   1.118 +
   1.119 +void DocTote::Reinit() {
   1.120 +  // No need to initialize score_ or value_
   1.121 +  incr_count_ = 0;
   1.122 +  sorted_ = 0;
   1.123 +  memset(closepair_, 0, sizeof(closepair_));
   1.124 +  memset(key_, 0xFF, sizeof(key_));
   1.125 +  runningscore_.Reinit();
   1.126 +}
   1.127 +
   1.128 +// Weight reliability by ibytes
   1.129 +// Also see three-way associative comments above for Tote
   1.130 +void DocTote::Add(uint16 ikey, int ibytes,
   1.131 +                              int score, int ireliability) {
   1.132 +  ++incr_count_;
   1.133 +
   1.134 +  // Look for existing entry in top 2 positions of 3, times 8 columns
   1.135 +  int sub0 = ikey & 15;
   1.136 +  if (key_[sub0] == ikey) {
   1.137 +    value_[sub0] += ibytes;
   1.138 +    score_[sub0] += score;
   1.139 +    reliability_[sub0] += ireliability * ibytes;
   1.140 +    return;
   1.141 +  }
   1.142 +  // Look for existing entry in other of top 2 positions of 3, times 8 columns
   1.143 +  int sub1 = sub0 ^ 8;
   1.144 +  if (key_[sub1] == ikey) {
   1.145 +    value_[sub1] += ibytes;
   1.146 +    score_[sub1] += score;
   1.147 +    reliability_[sub1] += ireliability * ibytes;
   1.148 +    return;
   1.149 +  }
   1.150 +  // Look for existing entry in third position of 3, times 8 columns
   1.151 +  int sub2 = (ikey & 7) + 16;
   1.152 +  if (key_[sub2] == ikey) {
   1.153 +    value_[sub2] += ibytes;
   1.154 +    score_[sub2] += score;
   1.155 +    reliability_[sub2] += ireliability * ibytes;
   1.156 +    return;
   1.157 +  }
   1.158 +
   1.159 +  // Allocate new entry
   1.160 +  int alloc = -1;
   1.161 +  if (key_[sub0] == kUnusedKey) {
   1.162 +    alloc = sub0;
   1.163 +  } else if (key_[sub1] == kUnusedKey) {
   1.164 +    alloc = sub1;
   1.165 +  } else if (key_[sub2] == kUnusedKey) {
   1.166 +    alloc = sub2;
   1.167 +  } else {
   1.168 +    // All choices allocated, need to replace smallest one
   1.169 +    alloc = sub0;
   1.170 +    if (value_[sub1] < value_[alloc]) {alloc = sub1;}
   1.171 +    if (value_[sub2] < value_[alloc]) {alloc = sub2;}
   1.172 +  }
   1.173 +  key_[alloc] = ikey;
   1.174 +  value_[alloc] = ibytes;
   1.175 +  score_[alloc] = score;
   1.176 +  reliability_[alloc] = ireliability * ibytes;
   1.177 +  return;
   1.178 +}
   1.179 +
   1.180 +// Find subscript of a given packed language, or -1
   1.181 +int DocTote::Find(uint16 ikey) {
   1.182 +  if (sorted_) {
   1.183 +    // Linear search if sorted
   1.184 +    for (int sub = 0; sub < kMaxSize_; ++sub) {
   1.185 +      if (key_[sub] == ikey) {return sub;}
   1.186 +    }
   1.187 +    return -1;
   1.188 +  }
   1.189 +
   1.190 +  // Look for existing entry
   1.191 +  int sub0 = ikey & 15;
   1.192 +  if (key_[sub0] == ikey) {
   1.193 +    return sub0;
   1.194 +  }
   1.195 +  int sub1 = sub0 ^ 8;
   1.196 +  if (key_[sub1] == ikey) {
   1.197 +    return sub1;
   1.198 +  }
   1.199 +  int sub2 = (ikey & 7) + 16;
   1.200 +  if (key_[sub2] == ikey) {
   1.201 +    return sub2;
   1.202 +  }
   1.203 +
   1.204 +  return -1;
   1.205 +}
   1.206 +
   1.207 +// Return current top key
   1.208 +int DocTote::CurrentTopKey() {
   1.209 +  int top_key = 0;
   1.210 +  int top_value = -1;
   1.211 +  for (int sub = 0; sub < kMaxSize_; ++sub) {
   1.212 +    if (key_[sub] == kUnusedKey) {continue;}
   1.213 +    if (top_value < value_[sub]) {
   1.214 +      top_value = value_[sub];
   1.215 +      top_key = key_[sub];
   1.216 +    }
   1.217 +  }
   1.218 +  return top_key;
   1.219 +}
   1.220 +
   1.221 +
   1.222 +// Sort first n entries by decreasing order of value
   1.223 +// If key==0 other fields are not valid, treat value as -1
   1.224 +void DocTote::Sort(int n) {
   1.225 +  // This is n**2, but n is small
   1.226 +  for (int sub = 0; sub < n; ++sub) {
   1.227 +    if (key_[sub] == kUnusedKey) {value_[sub] = -1;}
   1.228 +
   1.229 +    // Bubble sort key[sub] and entry[sub]
   1.230 +    for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
   1.231 +      if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;}
   1.232 +      if (value_[sub] < value_[sub2]) {
   1.233 +        // swap
   1.234 +        uint16 tmpk = key_[sub];
   1.235 +        key_[sub] = key_[sub2];
   1.236 +        key_[sub2] = tmpk;
   1.237 +
   1.238 +        int tmpv = value_[sub];
   1.239 +        value_[sub] = value_[sub2];
   1.240 +        value_[sub2] = tmpv;
   1.241 +
   1.242 +        double tmps = score_[sub];
   1.243 +        score_[sub] = score_[sub2];
   1.244 +        score_[sub2] = tmps;
   1.245 +
   1.246 +        int tmpr = reliability_[sub];
   1.247 +        reliability_[sub] = reliability_[sub2];
   1.248 +        reliability_[sub2] = tmpr;
   1.249 +      }
   1.250 +    }
   1.251 +  }
   1.252 +  sorted_ = 1;
   1.253 +}
   1.254 +
   1.255 +void DocTote::Dump(FILE* f) {
   1.256 +  fprintf(f, "DocTote::Dump\n");
   1.257 +  for (int sub = 0; sub < kMaxSize_; ++sub) {
   1.258 +    if (key_[sub] != kUnusedKey) {
   1.259 +      Language lang = static_cast<Language>(key_[sub]);
   1.260 +      fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang),
   1.261 +              value_[sub], score_[sub], reliability_[sub]);
   1.262 +    }
   1.263 +  }
   1.264 +  fprintf(f, "  %d chunks scored<br>\n", incr_count_);
   1.265 +}
   1.266 +
   1.267 +}       // End namespace CLD2
   1.268 +

mercurial