Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | #include "nsCharSetProber.h" |
michael@0 | 7 | #include "prmem.h" |
michael@0 | 8 | |
michael@0 | 9 | //This filter applies to all scripts which do not use English characters |
michael@0 | 10 | bool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen) |
michael@0 | 11 | { |
michael@0 | 12 | char *newptr; |
michael@0 | 13 | char *prevPtr, *curPtr; |
michael@0 | 14 | |
michael@0 | 15 | bool meetMSB = false; |
michael@0 | 16 | newptr = *newBuf = (char*)PR_Malloc(aLen); |
michael@0 | 17 | if (!newptr) |
michael@0 | 18 | return false; |
michael@0 | 19 | |
michael@0 | 20 | for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) |
michael@0 | 21 | { |
michael@0 | 22 | if (*curPtr & 0x80) |
michael@0 | 23 | { |
michael@0 | 24 | meetMSB = true; |
michael@0 | 25 | } |
michael@0 | 26 | else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') |
michael@0 | 27 | { |
michael@0 | 28 | //current char is a symbol, most likely a punctuation. we treat it as segment delimiter |
michael@0 | 29 | if (meetMSB && curPtr > prevPtr) |
michael@0 | 30 | //this segment contains more than single symbol, and it has upper ASCII, we need to keep it |
michael@0 | 31 | { |
michael@0 | 32 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; |
michael@0 | 33 | prevPtr++; |
michael@0 | 34 | *newptr++ = ' '; |
michael@0 | 35 | meetMSB = false; |
michael@0 | 36 | } |
michael@0 | 37 | else //ignore current segment. (either because it is just a symbol or just an English word) |
michael@0 | 38 | prevPtr = curPtr+1; |
michael@0 | 39 | } |
michael@0 | 40 | } |
michael@0 | 41 | if (meetMSB && curPtr > prevPtr) |
michael@0 | 42 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; |
michael@0 | 43 | |
michael@0 | 44 | newLen = newptr - *newBuf; |
michael@0 | 45 | |
michael@0 | 46 | return true; |
michael@0 | 47 | } |
michael@0 | 48 | |
michael@0 | 49 | //This filter applies to all scripts which contain both English characters and upper ASCII characters. |
michael@0 | 50 | bool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen) |
michael@0 | 51 | { |
michael@0 | 52 | //do filtering to reduce load to probers |
michael@0 | 53 | char *newptr; |
michael@0 | 54 | char *prevPtr, *curPtr; |
michael@0 | 55 | bool isInTag = false; |
michael@0 | 56 | |
michael@0 | 57 | newptr = *newBuf = (char*)PR_Malloc(aLen); |
michael@0 | 58 | if (!newptr) |
michael@0 | 59 | return false; |
michael@0 | 60 | |
michael@0 | 61 | for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) |
michael@0 | 62 | { |
michael@0 | 63 | if (*curPtr == '>') |
michael@0 | 64 | isInTag = false; |
michael@0 | 65 | else if (*curPtr == '<') |
michael@0 | 66 | isInTag = true; |
michael@0 | 67 | |
michael@0 | 68 | if (!(*curPtr & 0x80) && |
michael@0 | 69 | (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') ) |
michael@0 | 70 | { |
michael@0 | 71 | if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol |
michael@0 | 72 | // and it is not inside a tag, keep it. |
michael@0 | 73 | { |
michael@0 | 74 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; |
michael@0 | 75 | prevPtr++; |
michael@0 | 76 | *newptr++ = ' '; |
michael@0 | 77 | } |
michael@0 | 78 | else |
michael@0 | 79 | prevPtr = curPtr+1; |
michael@0 | 80 | } |
michael@0 | 81 | } |
michael@0 | 82 | |
michael@0 | 83 | // If the current segment contains more than just a symbol |
michael@0 | 84 | // and it is not inside a tag then keep it. |
michael@0 | 85 | if (!isInTag) |
michael@0 | 86 | while (prevPtr < curPtr) |
michael@0 | 87 | *newptr++ = *prevPtr++; |
michael@0 | 88 | |
michael@0 | 89 | newLen = newptr - *newBuf; |
michael@0 | 90 | |
michael@0 | 91 | return true; |
michael@0 | 92 | } |