|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // |
|
18 |
|
19 #include "tote.h" |
|
20 #include "lang_script.h" // For LanguageCode in Dump |
|
21 |
|
22 #include <stdio.h> |
|
23 #include <string.h> // For memset |
|
24 |
|
25 namespace CLD2 { |
|
26 |
|
27 // Take a set of <key, value> pairs and tote them up. |
|
28 // After explicitly sorting, retrieve top key, value pairs |
|
29 // Normal use is key=per-script language and value = probability score |
|
30 Tote::Tote() { |
|
31 in_use_mask_ = 0; |
|
32 byte_count_ = 0; |
|
33 score_count_ = 0; |
|
34 // No need to initialize values |
|
35 } |
|
36 |
|
37 Tote::~Tote() { |
|
38 } |
|
39 |
|
40 void Tote::Reinit() { |
|
41 in_use_mask_ = 0; |
|
42 byte_count_ = 0; |
|
43 score_count_ = 0; |
|
44 // No need to initialize values |
|
45 } |
|
46 // Increment count of quadgrams/trigrams/unigrams scored |
|
47 void Tote::AddScoreCount() { |
|
48 ++score_count_; |
|
49 } |
|
50 |
|
51 |
|
52 void Tote::Add(uint8 ikey, int idelta) { |
|
53 int key_group = ikey >> 2; |
|
54 uint64 groupmask = (1ULL << key_group); |
|
55 if ((in_use_mask_ & groupmask) == 0) { |
|
56 // Initialize this group |
|
57 gscore_[key_group] = 0; |
|
58 in_use_mask_ |= groupmask; |
|
59 } |
|
60 score_[ikey] += idelta; |
|
61 } |
|
62 |
|
63 |
|
64 // Return current top three keys |
|
65 void Tote::CurrentTopThreeKeys(int* key3) const { |
|
66 key3[0] = -1; |
|
67 key3[1] = -1; |
|
68 key3[2] = -1; |
|
69 int score3[3] = {-1, -1, -1}; |
|
70 uint64 tempmask = in_use_mask_; |
|
71 int base = 0; |
|
72 while (tempmask != 0) { |
|
73 if (tempmask & 1) { |
|
74 // Look at four in-use keys |
|
75 for (int i = 0; i < 4; ++i) { |
|
76 int insert_me = score_[base + i]; |
|
77 // Favor lower numbers on ties |
|
78 if (insert_me > score3[2]) { |
|
79 // Insert |
|
80 int insert_at = 2; |
|
81 if (insert_me > score3[1]) { |
|
82 score3[2] = score3[1]; |
|
83 key3[2] = key3[1]; |
|
84 insert_at = 1; |
|
85 if (insert_me > score3[0]) { |
|
86 score3[1] = score3[0]; |
|
87 key3[1] = key3[0]; |
|
88 insert_at = 0; |
|
89 } |
|
90 } |
|
91 score3[insert_at] = insert_me; |
|
92 key3[insert_at] = base + i; |
|
93 } |
|
94 } |
|
95 } |
|
96 tempmask >>= 1; |
|
97 base += 4; |
|
98 } |
|
99 } |
|
100 |
|
101 |
|
102 // Take a set of <key, value> pairs and tote them up. |
|
103 // After explicitly sorting, retrieve top key, value pairs |
|
104 // 0xFFFF in key signifies unused |
|
105 DocTote::DocTote() { |
|
106 // No need to initialize score_ or value_ |
|
107 incr_count_ = 0; |
|
108 sorted_ = 0; |
|
109 memset(closepair_, 0, sizeof(closepair_)); |
|
110 memset(key_, 0xFF, sizeof(key_)); |
|
111 } |
|
112 |
|
113 DocTote::~DocTote() { |
|
114 } |
|
115 |
|
116 void DocTote::Reinit() { |
|
117 // No need to initialize score_ or value_ |
|
118 incr_count_ = 0; |
|
119 sorted_ = 0; |
|
120 memset(closepair_, 0, sizeof(closepair_)); |
|
121 memset(key_, 0xFF, sizeof(key_)); |
|
122 runningscore_.Reinit(); |
|
123 } |
|
124 |
|
125 // Weight reliability by ibytes |
|
126 // Also see three-way associative comments above for Tote |
|
127 void DocTote::Add(uint16 ikey, int ibytes, |
|
128 int score, int ireliability) { |
|
129 ++incr_count_; |
|
130 |
|
131 // Look for existing entry in top 2 positions of 3, times 8 columns |
|
132 int sub0 = ikey & 15; |
|
133 if (key_[sub0] == ikey) { |
|
134 value_[sub0] += ibytes; |
|
135 score_[sub0] += score; |
|
136 reliability_[sub0] += ireliability * ibytes; |
|
137 return; |
|
138 } |
|
139 // Look for existing entry in other of top 2 positions of 3, times 8 columns |
|
140 int sub1 = sub0 ^ 8; |
|
141 if (key_[sub1] == ikey) { |
|
142 value_[sub1] += ibytes; |
|
143 score_[sub1] += score; |
|
144 reliability_[sub1] += ireliability * ibytes; |
|
145 return; |
|
146 } |
|
147 // Look for existing entry in third position of 3, times 8 columns |
|
148 int sub2 = (ikey & 7) + 16; |
|
149 if (key_[sub2] == ikey) { |
|
150 value_[sub2] += ibytes; |
|
151 score_[sub2] += score; |
|
152 reliability_[sub2] += ireliability * ibytes; |
|
153 return; |
|
154 } |
|
155 |
|
156 // Allocate new entry |
|
157 int alloc = -1; |
|
158 if (key_[sub0] == kUnusedKey) { |
|
159 alloc = sub0; |
|
160 } else if (key_[sub1] == kUnusedKey) { |
|
161 alloc = sub1; |
|
162 } else if (key_[sub2] == kUnusedKey) { |
|
163 alloc = sub2; |
|
164 } else { |
|
165 // All choices allocated, need to replace smallest one |
|
166 alloc = sub0; |
|
167 if (value_[sub1] < value_[alloc]) {alloc = sub1;} |
|
168 if (value_[sub2] < value_[alloc]) {alloc = sub2;} |
|
169 } |
|
170 key_[alloc] = ikey; |
|
171 value_[alloc] = ibytes; |
|
172 score_[alloc] = score; |
|
173 reliability_[alloc] = ireliability * ibytes; |
|
174 return; |
|
175 } |
|
176 |
|
177 // Find subscript of a given packed language, or -1 |
|
178 int DocTote::Find(uint16 ikey) { |
|
179 if (sorted_) { |
|
180 // Linear search if sorted |
|
181 for (int sub = 0; sub < kMaxSize_; ++sub) { |
|
182 if (key_[sub] == ikey) {return sub;} |
|
183 } |
|
184 return -1; |
|
185 } |
|
186 |
|
187 // Look for existing entry |
|
188 int sub0 = ikey & 15; |
|
189 if (key_[sub0] == ikey) { |
|
190 return sub0; |
|
191 } |
|
192 int sub1 = sub0 ^ 8; |
|
193 if (key_[sub1] == ikey) { |
|
194 return sub1; |
|
195 } |
|
196 int sub2 = (ikey & 7) + 16; |
|
197 if (key_[sub2] == ikey) { |
|
198 return sub2; |
|
199 } |
|
200 |
|
201 return -1; |
|
202 } |
|
203 |
|
204 // Return current top key |
|
205 int DocTote::CurrentTopKey() { |
|
206 int top_key = 0; |
|
207 int top_value = -1; |
|
208 for (int sub = 0; sub < kMaxSize_; ++sub) { |
|
209 if (key_[sub] == kUnusedKey) {continue;} |
|
210 if (top_value < value_[sub]) { |
|
211 top_value = value_[sub]; |
|
212 top_key = key_[sub]; |
|
213 } |
|
214 } |
|
215 return top_key; |
|
216 } |
|
217 |
|
218 |
|
219 // Sort first n entries by decreasing order of value |
|
220 // If key==0 other fields are not valid, treat value as -1 |
|
221 void DocTote::Sort(int n) { |
|
222 // This is n**2, but n is small |
|
223 for (int sub = 0; sub < n; ++sub) { |
|
224 if (key_[sub] == kUnusedKey) {value_[sub] = -1;} |
|
225 |
|
226 // Bubble sort key[sub] and entry[sub] |
|
227 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { |
|
228 if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;} |
|
229 if (value_[sub] < value_[sub2]) { |
|
230 // swap |
|
231 uint16 tmpk = key_[sub]; |
|
232 key_[sub] = key_[sub2]; |
|
233 key_[sub2] = tmpk; |
|
234 |
|
235 int tmpv = value_[sub]; |
|
236 value_[sub] = value_[sub2]; |
|
237 value_[sub2] = tmpv; |
|
238 |
|
239 double tmps = score_[sub]; |
|
240 score_[sub] = score_[sub2]; |
|
241 score_[sub2] = tmps; |
|
242 |
|
243 int tmpr = reliability_[sub]; |
|
244 reliability_[sub] = reliability_[sub2]; |
|
245 reliability_[sub2] = tmpr; |
|
246 } |
|
247 } |
|
248 } |
|
249 sorted_ = 1; |
|
250 } |
|
251 |
|
252 void DocTote::Dump(FILE* f) { |
|
253 fprintf(f, "DocTote::Dump\n"); |
|
254 for (int sub = 0; sub < kMaxSize_; ++sub) { |
|
255 if (key_[sub] != kUnusedKey) { |
|
256 Language lang = static_cast<Language>(key_[sub]); |
|
257 fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang), |
|
258 value_[sub], score_[sub], reliability_[sub]); |
|
259 } |
|
260 } |
|
261 fprintf(f, " %d chunks scored<br>\n", incr_count_); |
|
262 } |
|
263 |
|
264 } // End namespace CLD2 |
|
265 |