michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: michael@0: michael@0: #ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ michael@0: #define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ michael@0: michael@0: #include "integral_types.h" michael@0: #include "langspan.h" michael@0: #include "offsetmap.h" michael@0: michael@0: namespace CLD2 { michael@0: michael@0: static const int kMaxScriptBuffer = 40960; michael@0: static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; michael@0: static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room michael@0: static const int kWithinScriptTail = 32; // Stop at word space in last michael@0: // N bytes of script buffer michael@0: michael@0: michael@0: static inline bool IsContinuationByte(char c) { michael@0: return static_cast(c) < -64; michael@0: } michael@0: michael@0: // Gets lscript number for letters; always returns michael@0: // 0 (common script) for non-letters michael@0: int GetUTF8LetterScriptNum(const char* src); michael@0: michael@0: // Update src pointer to point to next quadgram, +2..+5 michael@0: // Looks at src[0..4] michael@0: const char* AdvanceQuad(const char* src); michael@0: michael@0: michael@0: class ScriptScanner { michael@0: public: michael@0: ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); michael@0: ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text, michael@0: bool any_text, bool any_script); michael@0: ~ScriptScanner(); michael@0: michael@0: // Copy next run of same-script non-tag letters to buffer [NUL terminated] michael@0: bool GetOneScriptSpan(LangSpan* span); michael@0: michael@0: // Force Latin and Cyrillic scripts to be lowercase michael@0: void LowerScriptSpan(LangSpan* span); michael@0: michael@0: // Copy next run of same-script non-tag letters to buffer [NUL terminated] michael@0: // Force Latin and Cyrillic scripts to be lowercase michael@0: bool GetOneScriptSpanLower(LangSpan* span); michael@0: michael@0: // Copy next run of non-tag characters to buffer [NUL terminated] michael@0: // This just removes tags and removes entities michael@0: // Buffer has leading space michael@0: bool GetOneTextSpan(LangSpan* span); michael@0: michael@0: // Maps byte offset in most recent GetOneScriptSpan/Lower michael@0: // span->text [0..text_bytes] into an additional byte offset from michael@0: // span->offset, to get back to corresponding text in the original michael@0: // input buffer. michael@0: // text_offset must be the first byte michael@0: // of a UTF-8 character, or just beyond the last character. Normally this michael@0: // routine is called with the first byte of an interesting range and michael@0: // again with the first byte of the following range. michael@0: int MapBack(int text_offset); michael@0: michael@0: const char* GetBufferStart() {return start_byte_;}; michael@0: michael@0: private: michael@0: // Skip over tags and non-letters michael@0: int SkipToFrontOfSpan(const char* src, int len, int* script); michael@0: michael@0: const char* start_byte_; // Starting byte of buffer to scan michael@0: const char* next_byte_; // First unscanned byte michael@0: const char* next_byte_limit_; // Last byte + 1 michael@0: int byte_length_; // Bytes left: next_byte_limit_ - next_byte_ michael@0: michael@0: bool is_plain_text_; // true fo text, false for HTML michael@0: char* script_buffer_; // Holds text with expanded entities michael@0: char* script_buffer_lower_; // Holds lowercased text michael@0: bool letters_marks_only_; // To distinguish scriptspan of one michael@0: // letters/marks vs. any mixture of text michael@0: bool one_script_only_; // To distinguish scriptspan of one michael@0: // script vs. any mixture of scripts michael@0: int exit_state_; // For tag parser kTagParseTbl_0, based michael@0: // on letters_marks_only_ michael@0: public : michael@0: // Expose for debugging michael@0: OffsetMap map2original_; // map from script_buffer_ to buffer michael@0: OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_ michael@0: }; michael@0: michael@0: } // namespace CLD2 michael@0: michael@0: #endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ michael@0: