michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: michael@0: michael@0: #include "getonescriptspan.h" michael@0: #include michael@0: michael@0: #include "fixunicodevalue.h" michael@0: #include "lang_script.h" michael@0: #include "port.h" michael@0: #include "utf8statetable.h" michael@0: michael@0: #include "utf8prop_lettermarkscriptnum.h" michael@0: #include "utf8repl_lettermarklower.h" michael@0: #include "utf8scannot_lettermarkspecial.h" michael@0: michael@0: michael@0: namespace CLD2 { michael@0: michael@0: // Alphabetical order for binary search, from michael@0: // generated_entities.cc michael@0: extern const int kNameToEntitySize; michael@0: extern const CharIntPair kNameToEntity[]; michael@0: michael@0: static const int kMaxUpToWordBoundary = 50; // span < this make longer, michael@0: // else make shorter michael@0: static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes michael@0: // to round to word boundary, michael@0: // direction above michael@0: michael@0: static const char kSpecialSymbol[256] = { // true for < > & michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: }; michael@0: michael@0: michael@0: michael@0: #define LT 0 // < michael@0: #define GT 1 // > michael@0: #define EX 2 // ! michael@0: #define HY 3 // - michael@0: #define QU 4 // " michael@0: #define AP 5 // ' michael@0: #define SL 6 // / michael@0: #define S_ 7 michael@0: #define C_ 8 michael@0: #define R_ 9 michael@0: #define I_ 10 michael@0: #define P_ 11 michael@0: #define T_ 12 michael@0: #define Y_ 13 michael@0: #define L_ 14 michael@0: #define E_ 15 michael@0: #define CR 16 // or michael@0: #define NL 17 // non-letter: ASCII whitespace, digit, punctuation michael@0: #define PL 18 // possible letter, incl. & michael@0: #define xx 19 // michael@0: michael@0: // Map byte to one of ~20 interesting categories for cheap tag parsing michael@0: static const uint8 kCharToSub[256] = { michael@0: NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, michael@0: NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, michael@0: NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, michael@0: NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, michael@0: michael@0: PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, michael@0: P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, michael@0: PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, michael@0: P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, michael@0: michael@0: NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, michael@0: NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, michael@0: NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, michael@0: NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, michael@0: michael@0: PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, michael@0: PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, michael@0: PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, michael@0: PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, michael@0: }; michael@0: michael@0: #undef LT michael@0: #undef GT michael@0: #undef EX michael@0: #undef HY michael@0: #undef QU michael@0: #undef AP michael@0: #undef SL michael@0: #undef S_ michael@0: #undef C_ michael@0: #undef R_ michael@0: #undef I_ michael@0: #undef P_ michael@0: #undef T_ michael@0: #undef Y_ michael@0: #undef L_ michael@0: #undef E_ michael@0: #undef CR michael@0: #undef NL michael@0: #undef PL michael@0: #undef xx michael@0: michael@0: michael@0: #define OK 0 michael@0: #define X_ 1 michael@0: michael@0: michael@0: static const int kMaxExitStateLettersMarksOnly = 1; michael@0: static const int kMaxExitStateAllText = 2; michael@0: michael@0: michael@0: // State machine to do cheap parse of non-letter strings incl. tags michael@0: // advances michael@0: // | | michael@0: // advances ... for