extensions/universalchardet/src/base/nsCharSetProber.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/nsCharSetProber.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,92 @@
     1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 + 
     1.9 +#include "nsCharSetProber.h"
    1.10 +#include "prmem.h"
    1.11 +
    1.12 +//This filter applies to all scripts which do not use English characters
    1.13 +bool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen)
    1.14 +{
    1.15 +  char *newptr;
    1.16 +  char *prevPtr, *curPtr;
    1.17 +  
    1.18 +  bool meetMSB = false;   
    1.19 +  newptr = *newBuf = (char*)PR_Malloc(aLen);
    1.20 +  if (!newptr)
    1.21 +    return false;
    1.22 +
    1.23 +  for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
    1.24 +  {
    1.25 +    if (*curPtr & 0x80)
    1.26 +    {
    1.27 +      meetMSB = true;
    1.28 +    }
    1.29 +    else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') 
    1.30 +    {
    1.31 +      //current char is a symbol, most likely a punctuation. we treat it as segment delimiter
    1.32 +      if (meetMSB && curPtr > prevPtr) 
    1.33 +      //this segment contains more than single symbol, and it has upper ASCII, we need to keep it
    1.34 +      {
    1.35 +        while (prevPtr < curPtr) *newptr++ = *prevPtr++;  
    1.36 +        prevPtr++;
    1.37 +        *newptr++ = ' ';
    1.38 +        meetMSB = false;
    1.39 +      }
    1.40 +      else //ignore current segment. (either because it is just a symbol or just an English word)
    1.41 +        prevPtr = curPtr+1;
    1.42 +    }
    1.43 +  }
    1.44 +  if (meetMSB && curPtr > prevPtr) 
    1.45 +    while (prevPtr < curPtr) *newptr++ = *prevPtr++;  
    1.46 +
    1.47 +  newLen = newptr - *newBuf;
    1.48 +
    1.49 +  return true;
    1.50 +}
    1.51 +
    1.52 +//This filter applies to all scripts which contain both English characters and upper ASCII characters.
    1.53 +bool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen)
    1.54 +{
    1.55 +  //do filtering to reduce load to probers
    1.56 +  char *newptr;
    1.57 +  char *prevPtr, *curPtr;
    1.58 +  bool isInTag = false;
    1.59 +
    1.60 +  newptr = *newBuf = (char*)PR_Malloc(aLen);
    1.61 +  if (!newptr)
    1.62 +    return false;
    1.63 +
    1.64 +  for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
    1.65 +  {
    1.66 +    if (*curPtr == '>')
    1.67 +      isInTag = false;
    1.68 +    else if (*curPtr == '<')
    1.69 +      isInTag = true;
    1.70 +
    1.71 +    if (!(*curPtr & 0x80) &&
    1.72 +        (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
    1.73 +    {
    1.74 +      if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 
    1.75 +                                        // and it is not inside a tag, keep it.
    1.76 +      {
    1.77 +        while (prevPtr < curPtr) *newptr++ = *prevPtr++;  
    1.78 +        prevPtr++;
    1.79 +        *newptr++ = ' ';
    1.80 +      }
    1.81 +      else
    1.82 +        prevPtr = curPtr+1;
    1.83 +    }
    1.84 +  }
    1.85 +
    1.86 +  // If the current segment contains more than just a symbol 
    1.87 +  // and it is not inside a tag then keep it.
    1.88 +  if (!isInTag)
    1.89 +    while (prevPtr < curPtr)
    1.90 +      *newptr++ = *prevPtr++;  
    1.91 +
    1.92 +  newLen = newptr - *newBuf;
    1.93 +
    1.94 +  return true;
    1.95 +}

mercurial