browser/components/translation/cld2/internal/tote.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // Author: dsites@google.com (Dick Sites)
    17 //
    19 #include "tote.h"
    20 #include "lang_script.h"    // For LanguageCode in Dump
    22 #include <stdio.h>
    23 #include <string.h>         // For memset
    25 namespace CLD2 {
    27 // Take a set of <key, value> pairs and tote them up.
    28 // After explicitly sorting, retrieve top key, value pairs
    29 // Normal use is key=per-script language and value = probability score
    30 Tote::Tote() {
    31   in_use_mask_ = 0;
    32   byte_count_ = 0;
    33   score_count_ = 0;
    34   // No need to initialize values
    35 }
    37 Tote::~Tote() {
    38 }
    40 void Tote::Reinit() {
    41   in_use_mask_ = 0;
    42   byte_count_ = 0;
    43   score_count_ = 0;
    44   // No need to initialize values
    45 }
    46 // Increment count of quadgrams/trigrams/unigrams scored
    47 void Tote::AddScoreCount() {
    48   ++score_count_;
    49 }
    52 void Tote::Add(uint8 ikey, int idelta) {
    53   int key_group = ikey >> 2;
    54   uint64 groupmask = (1ULL << key_group);
    55   if ((in_use_mask_ & groupmask) == 0) {
    56     // Initialize this group
    57     gscore_[key_group] = 0;
    58     in_use_mask_ |= groupmask;
    59   }
    60   score_[ikey] += idelta;
    61 }
    64 // Return current top three keys
    65 void Tote::CurrentTopThreeKeys(int* key3) const {
    66   key3[0] = -1;
    67   key3[1] = -1;
    68   key3[2] = -1;
    69   int score3[3] = {-1, -1, -1};
    70   uint64 tempmask = in_use_mask_;
    71   int base = 0;
    72   while (tempmask != 0) {
    73     if (tempmask & 1) {
    74       // Look at four in-use keys
    75       for (int i = 0; i < 4; ++i) {
    76         int insert_me = score_[base + i];
    77         // Favor lower numbers on ties
    78         if (insert_me > score3[2]) {
    79           // Insert
    80           int insert_at = 2;
    81           if (insert_me > score3[1]) {
    82             score3[2] = score3[1];
    83             key3[2] = key3[1];
    84             insert_at = 1;
    85             if (insert_me > score3[0]) {
    86               score3[1] = score3[0];
    87               key3[1] = key3[0];
    88               insert_at = 0;
    89             }
    90           }
    91           score3[insert_at] = insert_me;
    92           key3[insert_at] = base + i;
    93         }
    94       }
    95     }
    96     tempmask >>= 1;
    97     base += 4;
    98   }
    99 }
   102 // Take a set of <key, value> pairs and tote them up.
   103 // After explicitly sorting, retrieve top key, value pairs
   104 // 0xFFFF in key signifies unused
   105 DocTote::DocTote() {
   106   // No need to initialize score_ or value_
   107   incr_count_ = 0;
   108   sorted_ = 0;
   109   memset(closepair_, 0, sizeof(closepair_));
   110   memset(key_, 0xFF, sizeof(key_));
   111 }
   113 DocTote::~DocTote() {
   114 }
   116 void DocTote::Reinit() {
   117   // No need to initialize score_ or value_
   118   incr_count_ = 0;
   119   sorted_ = 0;
   120   memset(closepair_, 0, sizeof(closepair_));
   121   memset(key_, 0xFF, sizeof(key_));
   122   runningscore_.Reinit();
   123 }
   125 // Weight reliability by ibytes
   126 // Also see three-way associative comments above for Tote
   127 void DocTote::Add(uint16 ikey, int ibytes,
   128                               int score, int ireliability) {
   129   ++incr_count_;
   131   // Look for existing entry in top 2 positions of 3, times 8 columns
   132   int sub0 = ikey & 15;
   133   if (key_[sub0] == ikey) {
   134     value_[sub0] += ibytes;
   135     score_[sub0] += score;
   136     reliability_[sub0] += ireliability * ibytes;
   137     return;
   138   }
   139   // Look for existing entry in other of top 2 positions of 3, times 8 columns
   140   int sub1 = sub0 ^ 8;
   141   if (key_[sub1] == ikey) {
   142     value_[sub1] += ibytes;
   143     score_[sub1] += score;
   144     reliability_[sub1] += ireliability * ibytes;
   145     return;
   146   }
   147   // Look for existing entry in third position of 3, times 8 columns
   148   int sub2 = (ikey & 7) + 16;
   149   if (key_[sub2] == ikey) {
   150     value_[sub2] += ibytes;
   151     score_[sub2] += score;
   152     reliability_[sub2] += ireliability * ibytes;
   153     return;
   154   }
   156   // Allocate new entry
   157   int alloc = -1;
   158   if (key_[sub0] == kUnusedKey) {
   159     alloc = sub0;
   160   } else if (key_[sub1] == kUnusedKey) {
   161     alloc = sub1;
   162   } else if (key_[sub2] == kUnusedKey) {
   163     alloc = sub2;
   164   } else {
   165     // All choices allocated, need to replace smallest one
   166     alloc = sub0;
   167     if (value_[sub1] < value_[alloc]) {alloc = sub1;}
   168     if (value_[sub2] < value_[alloc]) {alloc = sub2;}
   169   }
   170   key_[alloc] = ikey;
   171   value_[alloc] = ibytes;
   172   score_[alloc] = score;
   173   reliability_[alloc] = ireliability * ibytes;
   174   return;
   175 }
   177 // Find subscript of a given packed language, or -1
   178 int DocTote::Find(uint16 ikey) {
   179   if (sorted_) {
   180     // Linear search if sorted
   181     for (int sub = 0; sub < kMaxSize_; ++sub) {
   182       if (key_[sub] == ikey) {return sub;}
   183     }
   184     return -1;
   185   }
   187   // Look for existing entry
   188   int sub0 = ikey & 15;
   189   if (key_[sub0] == ikey) {
   190     return sub0;
   191   }
   192   int sub1 = sub0 ^ 8;
   193   if (key_[sub1] == ikey) {
   194     return sub1;
   195   }
   196   int sub2 = (ikey & 7) + 16;
   197   if (key_[sub2] == ikey) {
   198     return sub2;
   199   }
   201   return -1;
   202 }
   204 // Return current top key
   205 int DocTote::CurrentTopKey() {
   206   int top_key = 0;
   207   int top_value = -1;
   208   for (int sub = 0; sub < kMaxSize_; ++sub) {
   209     if (key_[sub] == kUnusedKey) {continue;}
   210     if (top_value < value_[sub]) {
   211       top_value = value_[sub];
   212       top_key = key_[sub];
   213     }
   214   }
   215   return top_key;
   216 }
   219 // Sort first n entries by decreasing order of value
   220 // If key==0 other fields are not valid, treat value as -1
   221 void DocTote::Sort(int n) {
   222   // This is n**2, but n is small
   223   for (int sub = 0; sub < n; ++sub) {
   224     if (key_[sub] == kUnusedKey) {value_[sub] = -1;}
   226     // Bubble sort key[sub] and entry[sub]
   227     for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
   228       if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;}
   229       if (value_[sub] < value_[sub2]) {
   230         // swap
   231         uint16 tmpk = key_[sub];
   232         key_[sub] = key_[sub2];
   233         key_[sub2] = tmpk;
   235         int tmpv = value_[sub];
   236         value_[sub] = value_[sub2];
   237         value_[sub2] = tmpv;
   239         double tmps = score_[sub];
   240         score_[sub] = score_[sub2];
   241         score_[sub2] = tmps;
   243         int tmpr = reliability_[sub];
   244         reliability_[sub] = reliability_[sub2];
   245         reliability_[sub2] = tmpr;
   246       }
   247     }
   248   }
   249   sorted_ = 1;
   250 }
   252 void DocTote::Dump(FILE* f) {
   253   fprintf(f, "DocTote::Dump\n");
   254   for (int sub = 0; sub < kMaxSize_; ++sub) {
   255     if (key_[sub] != kUnusedKey) {
   256       Language lang = static_cast<Language>(key_[sub]);
   257       fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang),
   258               value_[sub], score_[sub], reliability_[sub]);
   259     }
   260   }
   261   fprintf(f, "  %d chunks scored<br>\n", incr_count_);
   262 }
   264 }       // End namespace CLD2

mercurial