1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/offsetmap.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,175 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 +#ifndef UTIL_UTF8_OFFSETMAP_H_ 1.23 +#define UTIL_UTF8_OFFSETMAP_H_ 1.24 + 1.25 +#include <string> // for string 1.26 +#include "integral_types.h" // for uint32 1.27 + 1.28 +// ***************************** OffsetMap ************************** 1.29 +// 1.30 +// An OffsetMap object is a container for a mapping from offsets in one text 1.31 +// buffer A' to offsets in another text buffer A. It is most useful when A' is 1.32 +// built from A via substitutions that occasionally do not preserve byte length. 1.33 +// 1.34 +// A series of operators are used to build the correspondence map, then 1.35 +// calls can be made to map an offset in A' to an offset in A, or vice versa. 1.36 +// The map starts with offset 0 in A corresponding to offset 0 in A'. 1.37 +// The mapping is then built sequentially, adding on byte ranges that are 1.38 +// identical in A and A', byte ranges that are inserted in A', and byte ranges 1.39 +// that are deleted from A. All bytes beyond those specified when building the 1.40 +// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the 1.41 +// end of the map. 1.42 +// 1.43 +// The internal data structure records positions at which bytes are added or 1.44 +// deleted. Using the map is O(1) when increasing the A' or A offset 1.45 +// monotonically, and O(n) when accessing random offsets, where n is the 1.46 +// number of differences. 1.47 +// 1.48 + 1.49 +namespace CLD2 { 1.50 + 1.51 +class OffsetMap { 1.52 + public: 1.53 + // Constructor, destructor 1.54 + OffsetMap(); 1.55 + ~OffsetMap(); 1.56 + 1.57 + // Clear the map 1.58 + void Clear(); 1.59 + 1.60 + // Add to mapping from A to A', specifying how many next bytes correspond 1.61 + // in A and A' 1.62 + void Copy(int bytes); 1.63 + 1.64 + // Add to mapping from A to A', specifying how many next bytes are 1.65 + // inserted in A' while not advancing in A at all 1.66 + void Insert(int bytes); 1.67 + 1.68 + // Add to mapping from A to A', specifying how many next bytes are 1.69 + // deleted from A while not advancing in A' at all 1.70 + void Delete(int bytes); 1.71 + 1.72 + // Print map to file, for debugging 1.73 + void Printmap(const char* filename); 1.74 + 1.75 + // [Finish building map,] Re-position to offset 0 1.76 + // This call is optional; MapForward and MapBack finish building the map 1.77 + // if necessary 1.78 + void Reset(); 1.79 + 1.80 + // Map an offset in A' to the corresponding offset in A 1.81 + int MapBack(int aprimeoffset); 1.82 + 1.83 + // Map an offset in A to the corresponding offset in A' 1.84 + int MapForward(int aoffset); 1.85 + 1.86 + // h = ComposeOffsetMap(g, f), where f is a map from A to A', g is 1.87 + // from A' to A'' and h is from A to A''. 1.88 + // 1.89 + // Note that g->MoveForward(f->MoveForward(aoffset)) always equals 1.90 + // to h->MoveForward(aoffset), while 1.91 + // f->MoveBack(g->MoveBack(aprimeprimeoffset)) doesn't always equals 1.92 + // to h->MoveBack(aprimeprimeoffset). This happens when deletion in 1.93 + // f and insertion in g are at the same place. For example, 1.94 + // 1.95 + // A 1 2 3 4 1.96 + // ^ | ^ ^ 1.97 + // | | / | f 1.98 + // v vv v 1.99 + // A' 1' 2' 3' 1.100 + // ^ ^^ ^ 1.101 + // | | \ | g 1.102 + // v | v v 1.103 + // A'' 1'' 2'' 3'' 4'' 1.104 + // 1.105 + // results in: 1.106 + // 1.107 + // A 1 2 3 4 1.108 + // ^ ^\ ^ ^ 1.109 + // | | \ | | h 1.110 + // v | vv v 1.111 + // A'' 1'' 2'' 3'' 4'' 1.112 + // 1.113 + // 2'' is mapped 3 in the former figure, while 2'' is mapped to 2 in 1.114 + // the latter figure. 1.115 + static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h); 1.116 + 1.117 + // For debugging only; writes to stderr 1.118 + void DumpWindow(); 1.119 + 1.120 + // For testing only -- force a mapping 1.121 + void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset); 1.122 + 1.123 + private: 1.124 + enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP}; 1.125 + 1.126 + void Flush(); 1.127 + void FlushAll(); 1.128 + void MaybeFlushAll(); 1.129 + void Emit(MapOp op, int len); 1.130 + 1.131 + void SetLeft(); 1.132 + void SetRight(); 1.133 + 1.134 + // Back up over previous range, 1..5 bytes 1.135 + // Return subscript at the beginning of that. Pins at 0 1.136 + int Backup(int sub); 1.137 + 1.138 + // Parse next range, 1..5 bytes 1.139 + // Return subscript just off the end of that 1.140 + int ParseNext(int sub, MapOp* op, int* length); 1.141 + 1.142 + // Parse previous range, 1..5 bytes 1.143 + // Return current subscript 1.144 + int ParsePrevious(int sub, MapOp* op, int* length); 1.145 + 1.146 + void PrintPosition(const char* str); 1.147 + 1.148 + bool MoveRight(); // Returns true if OK 1.149 + bool MoveLeft(); // Returns true if OK 1.150 + void DumpString(); 1.151 + 1.152 + // Copies insert operations from source to dest. Returns true if no 1.153 + // other operations are found. 1.154 + static bool CopyInserts(OffsetMap* source, OffsetMap* dest); 1.155 + 1.156 + // Copies delete operations from source to dest. Returns true if no other 1.157 + // operations are found. 1.158 + static bool CopyDeletes(OffsetMap* source, OffsetMap* dest); 1.159 + 1.160 + std::string diffs_; 1.161 + MapOp pending_op_; 1.162 + uint32 pending_length_; 1.163 + 1.164 + // Offsets in the ranges below correspond to each other, with A' = A + diff 1.165 + int next_diff_sub_; 1.166 + int current_lo_aoffset_; 1.167 + int current_hi_aoffset_; 1.168 + int current_lo_aprimeoffset_; 1.169 + int current_hi_aprimeoffset_; 1.170 + int current_diff_; 1.171 + int max_aoffset_; 1.172 + int max_aprimeoffset_; 1.173 +}; 1.174 + 1.175 +} // namespace CLD2 1.176 + 1.177 +#endif // UTIL_UTF8_OFFSETMAP_H_ 1.178 +