extensions/universalchardet/src/base/nsCharSetProber.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "nsCharSetProber.h"
     7 #include "prmem.h"
     9 //This filter applies to all scripts which do not use English characters
    10 bool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen)
    11 {
    12   char *newptr;
    13   char *prevPtr, *curPtr;
    15   bool meetMSB = false;   
    16   newptr = *newBuf = (char*)PR_Malloc(aLen);
    17   if (!newptr)
    18     return false;
    20   for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
    21   {
    22     if (*curPtr & 0x80)
    23     {
    24       meetMSB = true;
    25     }
    26     else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') 
    27     {
    28       //current char is a symbol, most likely a punctuation. we treat it as segment delimiter
    29       if (meetMSB && curPtr > prevPtr) 
    30       //this segment contains more than single symbol, and it has upper ASCII, we need to keep it
    31       {
    32         while (prevPtr < curPtr) *newptr++ = *prevPtr++;  
    33         prevPtr++;
    34         *newptr++ = ' ';
    35         meetMSB = false;
    36       }
    37       else //ignore current segment. (either because it is just a symbol or just an English word)
    38         prevPtr = curPtr+1;
    39     }
    40   }
    41   if (meetMSB && curPtr > prevPtr) 
    42     while (prevPtr < curPtr) *newptr++ = *prevPtr++;  
    44   newLen = newptr - *newBuf;
    46   return true;
    47 }
    49 //This filter applies to all scripts which contain both English characters and upper ASCII characters.
    50 bool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen)
    51 {
    52   //do filtering to reduce load to probers
    53   char *newptr;
    54   char *prevPtr, *curPtr;
    55   bool isInTag = false;
    57   newptr = *newBuf = (char*)PR_Malloc(aLen);
    58   if (!newptr)
    59     return false;
    61   for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
    62   {
    63     if (*curPtr == '>')
    64       isInTag = false;
    65     else if (*curPtr == '<')
    66       isInTag = true;
    68     if (!(*curPtr & 0x80) &&
    69         (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
    70     {
    71       if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 
    72                                         // and it is not inside a tag, keep it.
    73       {
    74         while (prevPtr < curPtr) *newptr++ = *prevPtr++;  
    75         prevPtr++;
    76         *newptr++ = ' ';
    77       }
    78       else
    79         prevPtr = curPtr+1;
    80     }
    81   }
    83   // If the current segment contains more than just a symbol 
    84   // and it is not inside a tag then keep it.
    85   if (!isInTag)
    86     while (prevPtr < curPtr)
    87       *newptr++ = *prevPtr++;  
    89   newLen = newptr - *newBuf;
    91   return true;
    92 }

mercurial