Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | |
michael@0 | 19 | |
michael@0 | 20 | #ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ |
michael@0 | 21 | #define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ |
michael@0 | 22 | |
michael@0 | 23 | #include "integral_types.h" |
michael@0 | 24 | #include "langspan.h" |
michael@0 | 25 | #include "offsetmap.h" |
michael@0 | 26 | |
michael@0 | 27 | namespace CLD2 { |
michael@0 | 28 | |
// Sizing constants for the script-span scanner.

// Size in bytes of the primary script buffer.
// NOTE(review): presumably the allocation size of
// ScriptScanner::script_buffer_ -- confirm against the implementation.
static const int kMaxScriptBuffer = 40960;

// One and a half times kMaxScriptBuffer.
// NOTE(review): presumably the allocation size of the lowercased copy, with
// headroom in case lowercasing lengthens the UTF-8 text -- confirm.
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;

// Usable payload of the script buffer: leave some room (32 bytes) at the end.
static const int kMaxScriptBytes = kMaxScriptBuffer - 32;

// Stop at a word space in the last N bytes of the script buffer.
static const int kWithinScriptTail = 32;
michael@0 | 34 | |
michael@0 | 35 | |
// Returns true when c is a UTF-8 continuation (trail) byte, i.e. a byte in
// the range 0x80..0xBF (bit pattern 10xxxxxx).
//
// Viewed as a signed 8-bit value those bytes are exactly -128..-65, so one
// comparison against -64 identifies them. The explicit cast to signed char
// makes the result independent of whether plain char is signed or unsigned
// on the target platform.
static inline bool IsContinuationByte(char c) {
  return static_cast<signed char>(c) < -64;
}
michael@0 | 39 | |
// Gets the lscript number for the UTF-8 character starting at src.
// Letters yield their script number; non-letters always yield
// 0 (common script).
int GetUTF8LetterScriptNum(const char* src);
michael@0 | 43 | |
// Returns a pointer to the next quadgram, which lies +2..+5 bytes past src.
// src itself is passed by value, so the caller must use the returned pointer.
// Looks at src[0..4].
// NOTE(review): presumably requires five readable bytes at src -- confirm.
const char* AdvanceQuad(const char* src);
michael@0 | 47 | |
michael@0 | 48 | |
michael@0 | 49 | class ScriptScanner { |
michael@0 | 50 | public: |
michael@0 | 51 | ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); |
michael@0 | 52 | ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text, |
michael@0 | 53 | bool any_text, bool any_script); |
michael@0 | 54 | ~ScriptScanner(); |
michael@0 | 55 | |
michael@0 | 56 | // Copy next run of same-script non-tag letters to buffer [NUL terminated] |
michael@0 | 57 | bool GetOneScriptSpan(LangSpan* span); |
michael@0 | 58 | |
michael@0 | 59 | // Force Latin and Cyrillic scripts to be lowercase |
michael@0 | 60 | void LowerScriptSpan(LangSpan* span); |
michael@0 | 61 | |
michael@0 | 62 | // Copy next run of same-script non-tag letters to buffer [NUL terminated] |
michael@0 | 63 | // Force Latin and Cyrillic scripts to be lowercase |
michael@0 | 64 | bool GetOneScriptSpanLower(LangSpan* span); |
michael@0 | 65 | |
michael@0 | 66 | // Copy next run of non-tag characters to buffer [NUL terminated] |
michael@0 | 67 | // This just removes tags and removes entities |
michael@0 | 68 | // Buffer has leading space |
michael@0 | 69 | bool GetOneTextSpan(LangSpan* span); |
michael@0 | 70 | |
michael@0 | 71 | // Maps byte offset in most recent GetOneScriptSpan/Lower |
michael@0 | 72 | // span->text [0..text_bytes] into an additional byte offset from |
michael@0 | 73 | // span->offset, to get back to corresponding text in the original |
michael@0 | 74 | // input buffer. |
michael@0 | 75 | // text_offset must be the first byte |
michael@0 | 76 | // of a UTF-8 character, or just beyond the last character. Normally this |
michael@0 | 77 | // routine is called with the first byte of an interesting range and |
michael@0 | 78 | // again with the first byte of the following range. |
michael@0 | 79 | int MapBack(int text_offset); |
michael@0 | 80 | |
michael@0 | 81 | const char* GetBufferStart() {return start_byte_;}; |
michael@0 | 82 | |
michael@0 | 83 | private: |
michael@0 | 84 | // Skip over tags and non-letters |
michael@0 | 85 | int SkipToFrontOfSpan(const char* src, int len, int* script); |
michael@0 | 86 | |
michael@0 | 87 | const char* start_byte_; // Starting byte of buffer to scan |
michael@0 | 88 | const char* next_byte_; // First unscanned byte |
michael@0 | 89 | const char* next_byte_limit_; // Last byte + 1 |
michael@0 | 90 | int byte_length_; // Bytes left: next_byte_limit_ - next_byte_ |
michael@0 | 91 | |
michael@0 | 92 | bool is_plain_text_; // true fo text, false for HTML |
michael@0 | 93 | char* script_buffer_; // Holds text with expanded entities |
michael@0 | 94 | char* script_buffer_lower_; // Holds lowercased text |
michael@0 | 95 | bool letters_marks_only_; // To distinguish scriptspan of one |
michael@0 | 96 | // letters/marks vs. any mixture of text |
michael@0 | 97 | bool one_script_only_; // To distinguish scriptspan of one |
michael@0 | 98 | // script vs. any mixture of scripts |
michael@0 | 99 | int exit_state_; // For tag parser kTagParseTbl_0, based |
michael@0 | 100 | // on letters_marks_only_ |
michael@0 | 101 | public : |
michael@0 | 102 | // Expose for debugging |
michael@0 | 103 | OffsetMap map2original_; // map from script_buffer_ to buffer |
michael@0 | 104 | OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_ |
michael@0 | 105 | }; |
michael@0 | 106 | |
michael@0 | 107 | } // namespace CLD2 |
michael@0 | 108 | |
michael@0 | 109 | #endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ |
michael@0 | 110 |