michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // State Table follower for scanning UTF-8 strings without converting to michael@0: // 32- or 16-bit Unicode values. michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: michael@0: #ifndef UTIL_UTF8_UTF8STATETABLE_H_ michael@0: #define UTIL_UTF8_UTF8STATETABLE_H_ michael@0: michael@0: #include michael@0: #include "integral_types.h" // for uint8, uint32, uint16 michael@0: #include "stringpiece.h" michael@0: michael@0: michael@0: namespace CLD2 { michael@0: michael@0: class OffsetMap; michael@0: michael@0: michael@0: // These four-byte entries compactly encode how many bytes 0..255 to delete michael@0: // in making a string replacement, how many bytes to add 0..255, and the offset michael@0: // 0..64k-1 of the replacement string in remap_string. michael@0: struct RemapEntry { michael@0: uint8 delete_bytes; michael@0: uint8 add_bytes; michael@0: uint16 bytes_offset; michael@0: }; michael@0: michael@0: // Exit type codes for state tables. All but the first get stuffed into michael@0: // signed one-byte entries. The first is only generated by executable code. 
// To distinguish from next-state entries, these must be contiguous and
// all <= kExitNone
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure,    // 240
  kExitOK,                  // 241
  kExitReject,              // 242
  kExitReplace1,            // 243
  kExitReplace2,            // 244
  kExitReplace3,            // 245
  kExitReplace21,           // 246
  kExitReplace31,           // 247
  kExitReplace32,           // 248
  kExitReplaceOffset1,      // 249
  kExitReplaceOffset2,      // 250
  kExitReplace1S0,          // 251
  kExitSpecial,             // 252
  kExitDoAgain,             // 253
  kExitRejectAlt,           // 254
  kExitNone                 // 255
} ExitReason;

// Parallel set of exit codes, listed in the same order as ExitReason but
// numbered contiguously from 32767 (0x7fff) through 32783 (0x800f).
typedef enum {
  kExitDstSpaceFull_2 = 32767,  // 32767  0x7fff
  kExitIllegalStructure_2,      // 32768  0x8000
  kExitOK_2,                    // 32769  0x8001
  kExitReject_2,                // 32770  0x8002
  kExitReplace1_2,              // 32771  0x8003
  kExitReplace2_2,              // 32772  0x8004
  kExitReplace3_2,              // 32773  0x8005
  kExitReplace21_2,             // 32774  0x8006
  kExitReplace31_2,             // 32775  0x8007
  kExitReplace32_2,             // 32776  0x8008
  kExitReplaceOffset1_2,        // 32777  0x8009
  kExitReplaceOffset2_2,        // 32778  0x800a
  kExitReplace1S0_2,            // 32779  0x800b
  kExitSpecial_2,               // 32780  0x800c
  kExitDoAgain_2,               // 32781  0x800d
  kExitRejectAlt_2,             // 32782  0x800e
  kExitNone_2                   // 32783  0x800f
} ExitReason_2;


// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string. state0 and state0_size
// give the byte offset and length within state_table of the initial state --
// table lookups are expected to start and end in this state, but for
// truncated UTF-8 strings, may end in a different state. These allow a quick
// test for that condition. entry_shift is 8 for tables subscripted by a full
// byte value and 6 for space-optimized tables subscripted by only six
// significant bits in UTF-8 continuation bytes.
michael@0: typedef struct { michael@0: const uint32 state0; michael@0: const uint32 state0_size; michael@0: const uint32 total_size; michael@0: const int max_expand; michael@0: const int entry_shift; michael@0: const int bytes_per_entry; michael@0: const uint32 losub; michael@0: const uint32 hiadd; michael@0: const uint8* state_table; michael@0: const RemapEntry* remap_base; michael@0: const uint8* remap_string; michael@0: const uint8* fast_state; michael@0: } UTF8StateMachineObj; michael@0: michael@0: // Near-duplicate declaration for tables with two-byte entries michael@0: typedef struct { michael@0: const uint32 state0; michael@0: const uint32 state0_size; michael@0: const uint32 total_size; michael@0: const int max_expand; michael@0: const int entry_shift; michael@0: const int bytes_per_entry; michael@0: const uint32 losub; michael@0: const uint32 hiadd; michael@0: const unsigned short* state_table; michael@0: const RemapEntry* remap_base; michael@0: const uint8* remap_string; michael@0: const uint8* fast_state; michael@0: } UTF8StateMachineObj_2; michael@0: michael@0: michael@0: typedef UTF8StateMachineObj UTF8PropObj; michael@0: typedef UTF8StateMachineObj UTF8ScanObj; michael@0: typedef UTF8StateMachineObj UTF8ReplaceObj; michael@0: typedef UTF8StateMachineObj_2 UTF8PropObj_2; michael@0: typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2; michael@0: // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2; michael@0: michael@0: michael@0: // Look up property of one UTF-8 character and advance over it michael@0: // Return 0 if input length is zero michael@0: // Return 0 and advance one byte if input is ill-formed michael@0: uint8 UTF8GenericProperty(const UTF8PropObj* st, michael@0: const uint8** src, michael@0: int* srclen); michael@0: michael@0: // Look up property of one UTF-8 character (assumed to be valid). michael@0: // (This is a faster version of UTF8GenericProperty.) 
michael@0: bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src); michael@0: michael@0: michael@0: // BigOneByte versions are needed for tables > 240 states, but most michael@0: // won't need the TwoByte versions. michael@0: michael@0: // Look up property of one UTF-8 character and advance over it michael@0: // Return 0 if input length is zero michael@0: // Return 0 and advance one byte if input is ill-formed michael@0: uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, michael@0: const uint8** src, michael@0: int* srclen); michael@0: michael@0: michael@0: // TwoByte versions are needed for tables > 240 states that don't fit onto michael@0: // BigOneByte -- rare ultimate fallback michael@0: michael@0: // Look up property of one UTF-8 character (assumed to be valid). michael@0: // (This is a faster version of UTF8GenericProperty.) michael@0: bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src); michael@0: michael@0: // Look up property of one UTF-8 character and advance over it michael@0: // Return 0 if input length is zero michael@0: // Return 0 and advance one byte if input is ill-formed michael@0: uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, michael@0: const uint8** src, michael@0: int* srclen); michael@0: michael@0: // Look up property of one UTF-8 character (assumed to be valid). michael@0: // (This is a faster version of UTF8GenericProperty.) michael@0: bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src); michael@0: michael@0: // Scan a UTF-8 stringpiece based on a state table. michael@0: // Always scan complete UTF-8 characters michael@0: // Set number of bytes scanned. 
Return reason for exiting michael@0: int UTF8GenericScan(const UTF8ScanObj* st, michael@0: const StringPiece& str, michael@0: int* bytes_consumed); michael@0: michael@0: michael@0: michael@0: // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece michael@0: // and doing text replacements. michael@0: // Always scan complete UTF-8 characters michael@0: // Set number of bytes consumed from input, number filled to output. michael@0: // Return reason for exiting michael@0: // Also writes an optional OffsetMap. Pass NULL to skip writing one. michael@0: int UTF8GenericReplace(const UTF8ReplaceObj* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed, michael@0: OffsetMap* offsetmap); michael@0: michael@0: // Older version without offsetmap michael@0: int UTF8GenericReplace(const UTF8ReplaceObj* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed); michael@0: michael@0: // Older version without is_plain_text or offsetmap michael@0: int UTF8GenericReplace(const UTF8ReplaceObj* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed); michael@0: michael@0: michael@0: // TwoByte version is needed for tables > about 256 states, such michael@0: // as the table for full Unicode 4.1 canonical + compatibility mapping michael@0: michael@0: // Scan a UTF-8 stringpiece based on state table with two-byte entries, michael@0: // copying to output stringpiece michael@0: // and doing text replacements. michael@0: // Always scan complete UTF-8 characters michael@0: // Set number of bytes consumed from input, number filled to output. 
michael@0: // Return reason for exiting michael@0: // Also writes an optional OffsetMap. Pass NULL to skip writing one. michael@0: int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed, michael@0: OffsetMap* offsetmap); michael@0: michael@0: // Older version without offsetmap michael@0: int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed); michael@0: michael@0: // Older version without is_plain_text or offsetmap michael@0: int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed); michael@0: michael@0: michael@0: static const unsigned char kUTF8LenTbl[256] = { michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, michael@0: 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 michael@0: }; michael@0: michael@0: inline int UTF8OneCharLen(const char* in) { michael@0: return kUTF8LenTbl[*reinterpret_cast(in)]; michael@0: } michael@0: michael@0: // Adjust a stringpiece to encompass complete UTF-8 characters. 
michael@0: // The data pointer will be increased by 0..3 bytes to get to a character michael@0: // boundary, and the length will then be decreased by 0..3 bytes michael@0: // to encompass the last complete character. michael@0: // This is useful especially when a UTF-8 string must be put into a fixed- michael@0: // maximum-size buffer cleanly, such as a MySQL buffer. michael@0: void UTF8TrimToChars(StringPiece* istr); michael@0: michael@0: } // End namespace CLD2 michael@0: michael@0: #endif // UTIL_UTF8_UTF8STATETABLE_H_