michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: michael@0: #ifndef UTIL_UTF8_OFFSETMAP_H_ michael@0: #define UTIL_UTF8_OFFSETMAP_H_ michael@0: michael@0: #include // for string michael@0: #include "integral_types.h" // for uint32 michael@0: michael@0: // ***************************** OffsetMap ************************** michael@0: // michael@0: // An OffsetMap object is a container for a mapping from offsets in one text michael@0: // buffer A' to offsets in another text buffer A. It is most useful when A' is michael@0: // built from A via substitutions that occasionally do not preserve byte length. michael@0: // michael@0: // A series of operators are used to build the correspondence map, then michael@0: // calls can be made to map an offset in A' to an offset in A, or vice versa. michael@0: // The map starts with offset 0 in A corresponding to offset 0 in A'. michael@0: // The mapping is then built sequentially, adding on byte ranges that are michael@0: // identical in A and A', byte ranges that are inserted in A', and byte ranges michael@0: // that are deleted from A. All bytes beyond those specified when building the michael@0: // map are assumed to correspond, i.e. a Copy(infinity) is assumed at the michael@0: // end of the map. michael@0: // michael@0: // The internal data structure records positions at which bytes are added or michael@0: // deleted. Using the map is O(1) when increasing the A' or A offset michael@0: // monotonically, and O(n) when accessing random offsets, where n is the michael@0: // number of differences. michael@0: // michael@0: michael@0: namespace CLD2 { michael@0: michael@0: class OffsetMap { michael@0: public: michael@0: // Constructor, destructor michael@0: OffsetMap(); michael@0: ~OffsetMap(); michael@0: michael@0: // Clear the map michael@0: void Clear(); michael@0: michael@0: // Add to mapping from A to A', specifying how many next bytes correspond michael@0: // in A and A' michael@0: void Copy(int bytes); michael@0: michael@0: // Add to mapping from A to A', specifying how many next bytes are michael@0: // inserted in A' while not advancing in A at all michael@0: void Insert(int bytes); michael@0: michael@0: // Add to mapping from A to A', specifying how many next bytes are michael@0: // deleted from A while not advancing in A' at all michael@0: void Delete(int bytes); michael@0: michael@0: // Print map to file, for debugging michael@0: void Printmap(const char* filename); michael@0: michael@0: // [Finish building map,] Re-position to offset 0 michael@0: // This call is optional; MapForward and MapBack finish building the map michael@0: // if necessary michael@0: void Reset(); michael@0: michael@0: // Map an offset in A' to the corresponding offset in A michael@0: int MapBack(int aprimeoffset); michael@0: michael@0: // Map an offset in A to the corresponding offset in A' michael@0: int MapForward(int aoffset); michael@0: michael@0: // h = ComposeOffsetMap(g, f), where f is a map from A to A', g is michael@0: // from A' to A'' and h is from A to A''. michael@0: // michael@0: // Note that g->MoveForward(f->MoveForward(aoffset)) always equals michael@0: // to h->MoveForward(aoffset), while michael@0: // f->MoveBack(g->MoveBack(aprimeprimeoffset)) doesn't always equals michael@0: // to h->MoveBack(aprimeprimeoffset). This happens when deletion in michael@0: // f and insertion in g are at the same place. For example, michael@0: // michael@0: // A 1 2 3 4 michael@0: // ^ | ^ ^ michael@0: // | | / | f michael@0: // v vv v michael@0: // A' 1' 2' 3' michael@0: // ^ ^^ ^ michael@0: // | | \ | g michael@0: // v | v v michael@0: // A'' 1'' 2'' 3'' 4'' michael@0: // michael@0: // results in: michael@0: // michael@0: // A 1 2 3 4 michael@0: // ^ ^\ ^ ^ michael@0: // | | \ | | h michael@0: // v | vv v michael@0: // A'' 1'' 2'' 3'' 4'' michael@0: // michael@0: // 2'' is mapped 3 in the former figure, while 2'' is mapped to 2 in michael@0: // the latter figure. michael@0: static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h); michael@0: michael@0: // For debugging only; writes to stderr michael@0: void DumpWindow(); michael@0: michael@0: // For testing only -- force a mapping michael@0: void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset); michael@0: michael@0: private: michael@0: enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP}; michael@0: michael@0: void Flush(); michael@0: void FlushAll(); michael@0: void MaybeFlushAll(); michael@0: void Emit(MapOp op, int len); michael@0: michael@0: void SetLeft(); michael@0: void SetRight(); michael@0: michael@0: // Back up over previous range, 1..5 bytes michael@0: // Return subscript at the beginning of that. Pins at 0 michael@0: int Backup(int sub); michael@0: michael@0: // Parse next range, 1..5 bytes michael@0: // Return subscript just off the end of that michael@0: int ParseNext(int sub, MapOp* op, int* length); michael@0: michael@0: // Parse previous range, 1..5 bytes michael@0: // Return current subscript michael@0: int ParsePrevious(int sub, MapOp* op, int* length); michael@0: michael@0: void PrintPosition(const char* str); michael@0: michael@0: bool MoveRight(); // Returns true if OK michael@0: bool MoveLeft(); // Returns true if OK michael@0: void DumpString(); michael@0: michael@0: // Copies insert operations from source to dest. Returns true if no michael@0: // other operations are found. michael@0: static bool CopyInserts(OffsetMap* source, OffsetMap* dest); michael@0: michael@0: // Copies delete operations from source to dest. Returns true if no other michael@0: // operations are found. michael@0: static bool CopyDeletes(OffsetMap* source, OffsetMap* dest); michael@0: michael@0: std::string diffs_; michael@0: MapOp pending_op_; michael@0: uint32 pending_length_; michael@0: michael@0: // Offsets in the ranges below correspond to each other, with A' = A + diff michael@0: int next_diff_sub_; michael@0: int current_lo_aoffset_; michael@0: int current_hi_aoffset_; michael@0: int current_lo_aprimeoffset_; michael@0: int current_hi_aprimeoffset_; michael@0: int current_diff_; michael@0: int max_aoffset_; michael@0: int max_aprimeoffset_; michael@0: }; michael@0: michael@0: } // namespace CLD2 michael@0: michael@0: #endif // UTIL_UTF8_OFFSETMAP_H_ michael@0: