1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/universalchardet/src/base/nsCharSetProber.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,92 @@ 1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "nsCharSetProber.h" 1.10 +#include "prmem.h" 1.11 + 1.12 +//This filter applies to all scripts which do not use English characters 1.13 +bool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen) 1.14 +{ 1.15 + char *newptr; 1.16 + char *prevPtr, *curPtr; 1.17 + 1.18 + bool meetMSB = false; 1.19 + newptr = *newBuf = (char*)PR_Malloc(aLen); 1.20 + if (!newptr) 1.21 + return false; 1.22 + 1.23 + for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 1.24 + { 1.25 + if (*curPtr & 0x80) 1.26 + { 1.27 + meetMSB = true; 1.28 + } 1.29 + else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') 1.30 + { 1.31 + //current char is a symbol, most likely a punctuation. we treat it as segment delimiter 1.32 + if (meetMSB && curPtr > prevPtr) 1.33 + //this segment contains more than single symbol, and it has upper ASCII, we need to keep it 1.34 + { 1.35 + while (prevPtr < curPtr) *newptr++ = *prevPtr++; 1.36 + prevPtr++; 1.37 + *newptr++ = ' '; 1.38 + meetMSB = false; 1.39 + } 1.40 + else //ignore current segment. (either because it is just a symbol or just an English word) 1.41 + prevPtr = curPtr+1; 1.42 + } 1.43 + } 1.44 + if (meetMSB && curPtr > prevPtr) 1.45 + while (prevPtr < curPtr) *newptr++ = *prevPtr++; 1.46 + 1.47 + newLen = newptr - *newBuf; 1.48 + 1.49 + return true; 1.50 +} 1.51 + 1.52 +//This filter applies to all scripts which contain both English characters and upper ASCII characters. 1.53 +bool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen) 1.54 +{ 1.55 + //do filtering to reduce load to probers 1.56 + char *newptr; 1.57 + char *prevPtr, *curPtr; 1.58 + bool isInTag = false; 1.59 + 1.60 + newptr = *newBuf = (char*)PR_Malloc(aLen); 1.61 + if (!newptr) 1.62 + return false; 1.63 + 1.64 + for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 1.65 + { 1.66 + if (*curPtr == '>') 1.67 + isInTag = false; 1.68 + else if (*curPtr == '<') 1.69 + isInTag = true; 1.70 + 1.71 + if (!(*curPtr & 0x80) && 1.72 + (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') ) 1.73 + { 1.74 + if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 1.75 + // and it is not inside a tag, keep it. 1.76 + { 1.77 + while (prevPtr < curPtr) *newptr++ = *prevPtr++; 1.78 + prevPtr++; 1.79 + *newptr++ = ' '; 1.80 + } 1.81 + else 1.82 + prevPtr = curPtr+1; 1.83 + } 1.84 + } 1.85 + 1.86 + // If the current segment contains more than just a symbol 1.87 + // and it is not inside a tag then keep it. 1.88 + if (!isInTag) 1.89 + while (prevPtr < curPtr) 1.90 + *newptr++ = *prevPtr++; 1.91 + 1.92 + newLen = newptr - *newBuf; 1.93 + 1.94 + return true; 1.95 +}