The Tor Browser: netwerk/streamconv/converters/mozTXTToHTMLConv.cpp@deefc01c0e14 (annotated)

netwerk/streamconv/converters/mozTXTToHTMLConv.cpp@deefc01c0e14 (annotated)

netwerk/streamconv/converters/mozTXTToHTMLConv.cpp

Thu, 15 Jan 2015 21:03:48 +0100

author: Michael Schloh von Bennewitz <michael@schloh.com>
date: Thu, 15 Jan 2015 21:03:48 +0100
branch: TOR_BUG_9701
changeset 11: deefc01c0e14
permissions: -rw-r--r--

Integrate friendly tips from Tor colleagues to make (or not) 4.5 alpha 3;
this includes removal of overloaded (but unused) methods, and addition of
an overlooked call to DataStruct::SetData(nsISupports, uint32_t, bool).

 /* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 #include "mozTXTToHTMLConv.h"
 #include "nsNetUtil.h"
 #include "nsUnicharUtils.h"
 #include "nsCRT.h"
 #include "nsIExternalProtocolHandler.h"
 #include "nsIIOService.h"
 #include <algorithm>
 #ifdef DEBUG_BenB_Perf
 #include "prtime.h"
 #include "prinrval.h"
 #endif
 // Growth factor constant. It is not referenced in the part of this file
 // visible here. NOTE(review): presumably used when sizing an output
 // buffer elsewhere in the file -- confirm before changing or removing.
 const double growthRate = 1.2;
 // Whitespace test used by the scanners in this file.
 // In addition to ASCII whitespace it accepts:
 //   - 0xA0, the Latin1/Unicode non-breaking space (nbsp): since bug 183111
 //     the editor replaces multiple spaces with leading 0xA0's and a single
 //     ending space, so 0xA0 has to count as a space here;
 //   - 0x3000, the Japanese ideographic space.
 static inline bool IsSpace(const char16_t aChar)
 {
   if (aChar == 0xA0 || aChar == 0x3000)
     return true;
   return nsCRT::IsAsciiSpace(aChar);
 }
 // EscapeChar appends the HTML-escaped form of |ch| to aStringToAppendTo.
 // '<', '>' and '&' are always written as &lt;, &gt; and &amp;.
 // '"' is written as &quot; only when inAttribute is true.
 // Any other character (including '"' outside attributes) is appended as is.
 void
 mozTXTToHTMLConv::EscapeChar(const char16_t ch, nsString& aStringToAppendTo,
                              bool inAttribute)
 {
     if (ch == '<')
     {
       aStringToAppendTo.AppendLiteral("&lt;");
     }
     else if (ch == '>')
     {
       aStringToAppendTo.AppendLiteral("&gt;");
     }
     else if (ch == '&')
     {
       aStringToAppendTo.AppendLiteral("&amp;");
     }
     else if (ch == '"' && inAttribute)
     {
       aStringToAppendTo.AppendLiteral("&quot;");
     }
     else
     {
       // nothing to escape: copy the character through
       aStringToAppendTo += ch;
     }
 }
 // EscapeStr HTML-escapes aInString IN PLACE.
 // '<', '>' and '&' are replaced by &lt;, &gt; and &amp;; '"' is replaced
 // by &quot; only when inAttribute is true.
 void
 mozTXTToHTMLConv::EscapeStr(nsString& aInString, bool inAttribute)
 {
   // Each special character is removed with Cut() and its entity is put in
   // with Insert(). The ReplaceSubstring routines were avoided because they
   // did not seem to work when a character of the search string also occurs
   // in the replacement string (as '&' does in "&amp;").
   uint32_t cursor = 0;
   while (cursor < aInString.Length())
   {
     const char16_t current = aInString[cursor];
     if (current == '<')
     {
       aInString.Cut(cursor, 1);
       aInString.Insert(NS_LITERAL_STRING("&lt;"), cursor);
       cursor += 4; // step over the characters we just inserted
     }
     else if (current == '>')
     {
       aInString.Cut(cursor, 1);
       aInString.Insert(NS_LITERAL_STRING("&gt;"), cursor);
       cursor += 4; // step over the characters we just inserted
     }
     else if (current == '&')
     {
       aInString.Cut(cursor, 1);
       aInString.Insert(NS_LITERAL_STRING("&amp;"), cursor);
       cursor += 5; // step over the characters we just inserted
     }
     else if (current == '"' && inAttribute)
     {
       aInString.Cut(cursor, 1);
       aInString.Insert(NS_LITERAL_STRING("&quot;"), cursor);
       cursor += 6; // step over the characters we just inserted
     }
     else
     {
       // ordinary character (or a quote outside of an attribute)
       cursor++;
     }
   }
 }
 void
 mozTXTToHTMLConv::UnescapeStr(const char16_t * aInString, int32_t aStartPos, int32_t aLength, nsString& aOutString)
 {
   const char16_t * subString = nullptr;
   for (uint32_t i = aStartPos; int32_t(i) - aStartPos < aLength;)
   {
     int32_t remainingChars = i - aStartPos;
     if (aInString[i] == '&')
     {
       subString = &aInString[i];
       if (!nsCRT::strncmp(subString, MOZ_UTF16("&lt;"), std::min(4, aLength - remainingChars)))
       {
         aOutString.Append(char16_t('<'));
         i += 4;
       }
       else if (!nsCRT::strncmp(subString, MOZ_UTF16("&gt;"), std::min(4, aLength - remainingChars)))
       {
         aOutString.Append(char16_t('>'));
         i += 4;
       }
       else if (!nsCRT::strncmp(subString, MOZ_UTF16("&amp;"), std::min(5, aLength - remainingChars)))
       {
         aOutString.Append(char16_t('&'));
         i += 5;
       }
       else if (!nsCRT::strncmp(subString, MOZ_UTF16("&quot;"), std::min(6, aLength - remainingChars)))
       {
         aOutString.Append(char16_t('"'));
         i += 6;
       }
       else
       {
         aOutString += aInString[i];
         i++;
       }
     }
     else
     {
       aOutString += aInString[i];
       i++;
     }
   }
 }
 // CompleteAbbreviatedURL turns an abbreviated URL into a full one by
 // prepending a scheme. |pos| is the index, inside aInString, of the
 // character that triggered the detection:
 //   '@' -> "mailto:" is prepended, but only if a '.' follows the '@'
 //   '.' -> "http://" is prepended if the text starts with "www.",
 //          "ftp://" if it starts with "ftp."
 // In all other cases aOutString is left untouched.
 void
 mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t * aInString, int32_t aInLength,
                                          const uint32_t pos, nsString& aOutString)
 {
   NS_ASSERTION(int32_t(pos) < aInLength, "bad args to CompleteAbbreviatedURL, see bug #190851");
   if (int32_t(pos) >= aInLength)
     return;
   const char16_t trigger = aInString[pos];
   if (trigger == '@')
   {
     // Only build a mailto: URL when there is a ".domain" part, i.e. we
     // want to linkify johndoe@foo.com but not "let's meet @8pm".
     nsDependentString inString(aInString, aInLength);
     if (inString.FindChar('.', pos) == kNotFound)
       return;
     aOutString.AssignLiteral("mailto:");
     aOutString += aInString;
     return;
   }
   if (trigger != '.')
     return;
   if (ItMatchesDelimited(aInString, aInLength,
                          MOZ_UTF16("www."), 4, LT_IGNORE, LT_IGNORE))
   {
     aOutString.AssignLiteral("http://");
     aOutString += aInString;
     return;
   }
   if (ItMatchesDelimited(aInString, aInLength,
                          MOZ_UTF16("ftp."), 4, LT_IGNORE, LT_IGNORE))
   {
     aOutString.AssignLiteral("ftp://");
     aOutString += aInString;
   }
 }
 // FindURLStart looks backwards from |pos| for the first character of a URL.
 // |check| selects the URL notation to test for. On success the index of the
 // first URL character is stored in |start| and true is returned; otherwise
 // false is returned and |start| is not written.
 bool
 mozTXTToHTMLConv::FindURLStart(const char16_t * aInString, int32_t aInLength,
                                const uint32_t pos, const modetype check,
                                uint32_t& start)
 {
   switch(check)
   { // no breaks, because end of blocks is never reached
   case RFC1738:
   {
     // "<URL:" must end exactly at |pos| (the ':'); the URL itself then
     // begins right after it. The std::max keeps the index non-negative
     // when pos < 4.
     if (!nsCRT::strncmp(&aInString[std::max(int32_t(pos - 4), 0)], MOZ_UTF16("<URL:"), 5))
     {
       start = pos + 1;
       return true;
     }
     else
       return false;
   }
   case RFC2396E:
   {
     // Search backwards from pos - 1 for the nearest of '<', '>' or '"'.
     // Only '<' and '"' open a URL; the URL starts one character later
     // and must start before |pos|.
     nsString temp(aInString, aInLength);
     int32_t i = pos <= 0 ? kNotFound : temp.RFindCharInSet(MOZ_UTF16("<>\""), pos - 1);
     if (i != kNotFound && (temp[uint32_t(i)] == '<' ||
                            temp[uint32_t(i)] == '"'))
     {
       start = uint32_t(++i);
       return start < pos;
     }
     else
       return false;
   }
   case freetext:
   {
     // Walk backwards over the characters accepted here before |pos|
     // (ASCII letters, digits, '+', '-', '.').
     int32_t i = pos - 1;
     for (; i >= 0 && (
          nsCRT::IsAsciiAlpha(aInString[uint32_t(i)]) ||
          nsCRT::IsAsciiDigit(aInString[uint32_t(i)]) ||
          aInString[uint32_t(i)] == '+' ||
          aInString[uint32_t(i)] == '-' ||
          aInString[uint32_t(i)] == '.'
          ); i--)
       ;
     // The loop stopped one position before the candidate start; step
     // forward again. The candidate must be non-empty and begin with an
     // ASCII letter.
     if (++i >= 0 && uint32_t(i) < pos && nsCRT::IsAsciiAlpha(aInString[uint32_t(i)]))
     {
       start = uint32_t(i);
       return true;
     }
     else
       return false;
   }
   case abbreviated:
   {
     int32_t i = pos - 1;
     // This disallows non-ascii-characters for email.
     // Currently correct, but revisit later after standards changed.
     bool isEmail = aInString[pos] == (char16_t)'@';
     // These chars mark the start of the URL
     for (; i >= 0
              && aInString[uint32_t(i)] != '>' && aInString[uint32_t(i)] != '<'
              && aInString[uint32_t(i)] != '"' && aInString[uint32_t(i)] != '\''
              && aInString[uint32_t(i)] != '`' && aInString[uint32_t(i)] != ','
              && aInString[uint32_t(i)] != '{' && aInString[uint32_t(i)] != '['
              && aInString[uint32_t(i)] != '(' && aInString[uint32_t(i)] != '|'
              && aInString[uint32_t(i)] != '\\'
              && !IsSpace(aInString[uint32_t(i)])
              && (!isEmail || nsCRT::IsAscii(aInString[uint32_t(i)]))
          ; i--)
       ;
     // Step forward onto the first character after the delimiter. The
     // candidate must be non-empty and begin with an ASCII letter or digit.
     if
       (
         ++i >= 0 && uint32_t(i) < pos
           &&
           (
             nsCRT::IsAsciiAlpha(aInString[uint32_t(i)]) ||
             nsCRT::IsAsciiDigit(aInString[uint32_t(i)])
           )
       )
     {
       start = uint32_t(i);
       return true;
     }
     else
       return false;
   }
   default:
     return false;
   } //switch
 }
 // FindURLEnd looks forward from |pos| for the last character of a URL.
 // |check| selects the URL notation and |start| is the index found by
 // FindURLStart. On success the index of the last URL character is stored
 // in |end| and true is returned; otherwise false is returned and |end| is
 // not written.
 bool
 mozTXTToHTMLConv::FindURLEnd(const char16_t * aInString, int32_t aInStringLength, const uint32_t pos,
            const modetype check, const uint32_t start, uint32_t& end)
 {
   switch(check)
   { // no breaks, because end of blocks is never reached
   case RFC1738:
   case RFC2396E:
   {
     // Find the next '<', '>' or '"' after |pos|. It has to be the closing
     // delimiter that matches the opening one: '>' for RFC1738 or when the
     // character before |start| is '<', otherwise '"'.
     nsString temp(aInString, aInStringLength);
     int32_t i = temp.FindCharInSet(MOZ_UTF16("<>\""), pos + 1);
     // Note: the post-decrement moves i from the delimiter to the last
     // character of the URL after the delimiter has been read.
     if (i != kNotFound && temp[uint32_t(i--)] ==
         (check == RFC1738 || temp[start - 1] == '<' ? '>' : '"'))
     {
       end = uint32_t(i);
       return end > pos;
     }
     return false;
   }
   case freetext:
   case abbreviated:
   {
     uint32_t i = pos + 1;
     bool isEmail = aInString[pos] == (char16_t)'@';
     bool seenOpeningParenthesis = false; // there is a '(' earlier in the URL
     bool seenOpeningSquareBracket = false; // there is a '[' earlier in the URL
     for (; int32_t(i) < aInStringLength; i++)
     {
       // These chars mark the end of the URL
       if (aInString[i] == '>' || aInString[i] == '<' ||
           aInString[i] == '"' || aInString[i] == '`' ||
           aInString[i] == '}' || aInString[i] == '{' ||
           aInString[i] == '|' ||
           (aInString[i] == ')' && !seenOpeningParenthesis) ||
           (aInString[i] == ']' && !seenOpeningSquareBracket) ||
           // Allow IPv6 addresses like http://[1080::8:800:200C:417A]/foo.
           (aInString[i] == '[' && i > 2 &&
            (aInString[i - 1] != '/' || aInString[i - 2] != '/')) ||
           IsSpace(aInString[i]))
           break;
       // Disallow non-ascii-characters for email.
       // Currently correct, but revisit later after standards changed.
       if (isEmail && (
             aInString[i] == '(' || aInString[i] == '\'' ||
             !nsCRT::IsAscii(aInString[i])))
           break;
       if (aInString[i] == '(')
         seenOpeningParenthesis = true;
       if (aInString[i] == '[')
         seenOpeningSquareBracket = true;
     }
     // These chars are allowed in the middle of the URL, but not at end.
     // Technically they are, but are used in normal text after the URL.
     // (i is one past the last accepted character here, hence the
     // pre-decrement before each test.)
     while (--i > pos && (
              aInString[i] == '.' || aInString[i] == ',' || aInString[i] == ';' ||
              aInString[i] == '!' || aInString[i] == '?' || aInString[i] == '-' ||
              aInString[i] == ':' || aInString[i] == '\''
              ))
         ;
     // At least one character after |pos| must remain.
     if (i > pos)
     {
       end = i;
       return true;
     }
     return false;
   }
   default:
     return false;
   } //switch
 }
 void
 mozTXTToHTMLConv::CalculateURLBoundaries(const char16_t * aInString, int32_t aInStringLength,
      const uint32_t pos, const uint32_t whathasbeendone,
      const modetype check, const uint32_t start, const uint32_t end,
      nsString& txtURL, nsString& desc,
      int32_t& replaceBefore, int32_t& replaceAfter)
 {
   uint32_t descstart = start;
   switch(check)
   {
   case RFC1738:
   {
     descstart = start - 5;
     desc.Append(&aInString[descstart], end - descstart + 2);  // include "<URL:" and ">"
     replaceAfter = end - pos + 1;
   } break;
   case RFC2396E:
   {
     descstart = start - 1;
     desc.Append(&aInString[descstart], end - descstart + 2); // include brackets
     replaceAfter = end - pos + 1;
   } break;
   case freetext:
   case abbreviated:
   {
     descstart = start;
     desc.Append(&aInString[descstart], end - start + 1); // don't include brackets
     replaceAfter = end - pos;
   } break;
   default: break;
   } //switch
   EscapeStr(desc, false);
   txtURL.Append(&aInString[start], end - start + 1);
   txtURL.StripWhitespace();
   // FIX ME
   nsAutoString temp2;
   ScanTXT(&aInString[descstart], pos - descstart, ~kURLs /*prevents loop*/ & whathasbeendone, temp2);
   replaceBefore = temp2.Length();
   return;
 }
 // ShouldLinkify decides whether aURL deserves to become a link.
 // Returns false when there is no IO service, when no scheme can be
 // extracted or when no protocol handler is available for the scheme.
 // URLs whose handler is not an external protocol handler are always
 // linkified; for external handlers an application must exist for the scheme.
 bool mozTXTToHTMLConv::ShouldLinkify(const nsCString& aURL)
 {
   if (!mIOService)
     return false;
   nsAutoCString scheme;
   nsresult rv = mIOService->ExtractScheme(aURL, scheme);
   if (NS_FAILED(rv))
     return false;
   // Look up the handler responsible for this scheme.
   nsCOMPtr<nsIProtocolHandler> handler;
   rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler));
   if (NS_FAILED(rv))
     return false;
   // A handler that is not an external protocol handler is built-in:
   // linkify it!
   nsCOMPtr<nsIExternalProtocolHandler> externalHandler = do_QueryInterface(handler);
   if (!externalHandler)
     return true;
   // External handler: linkify only if an application exists for the scheme.
   bool exists;
   rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists);
   if (NS_FAILED(rv))
     return false;
   return exists;
 }
 bool
 mozTXTToHTMLConv::CheckURLAndCreateHTML(
      const nsString& txtURL, const nsString& desc, const modetype mode,
      nsString& outputHTML)
 {
   // Create *uri from txtURL
   nsCOMPtr<nsIURI> uri;
   nsresult rv;
   // Lazily initialize mIOService
   if (!mIOService)
   {
     mIOService = do_GetIOService();
     if (!mIOService)
       return false;
   }
   // See if the url should be linkified.
   NS_ConvertUTF16toUTF8 utf8URL(txtURL);
   if (!ShouldLinkify(utf8URL))
     return false;
   // it would be faster if we could just check to see if there is a protocol
   // handler for the url and return instead of actually trying to create a url...
   rv = mIOService->NewURI(utf8URL, nullptr, nullptr, getter_AddRefs(uri));
   // Real work
   if (NS_SUCCEEDED(rv) && uri)
   {
     outputHTML.AssignLiteral("<a class=\"moz-txt-link-");
     switch(mode)
     {
     case RFC1738:
       outputHTML.AppendLiteral("rfc1738");
       break;
     case RFC2396E:
       outputHTML.AppendLiteral("rfc2396E");
       break;
     case freetext:
       outputHTML.AppendLiteral("freetext");
       break;
     case abbreviated:
       outputHTML.AppendLiteral("abbreviated");
       break;
     default: break;
     }
     nsAutoString escapedURL(txtURL);
     EscapeStr(escapedURL, true);
     outputHTML.AppendLiteral("\" href=\"");
     outputHTML += escapedURL;
     outputHTML.AppendLiteral("\">");
     outputHTML += desc;
     outputHTML.AppendLiteral("</a>");
     return true;
   }
   else
     return false;
 }
 // FindURLInPlaintext checks whether the character at index aPos of
 // aInString (aInLength characters long) is part of a URL, by calling
 // FindURL. *aStartPos and *aEndPos are preset to -1 and are only
 // overwritten by FindURL (with its replaceBefore / replaceAfter results)
 // when a URL is found. The HTML that FindURL generates is discarded.
 // Always returns NS_OK.
 NS_IMETHODIMP mozTXTToHTMLConv::FindURLInPlaintext(const char16_t * aInString, int32_t aInLength, int32_t aPos, int32_t * aStartPos, int32_t * aEndPos)
 {
   *aStartPos = -1;
   *aEndPos = -1;
   // FindURL reads aInString[aPos] unconditionally, so a missing string or a
   // position outside of the string must not reach it. Such input simply
   // contains no URL at aPos.
   if (!aInString || aPos < 0 || aPos >= aInLength)
     return NS_OK;
   // call FindURL on the passed in string
   nsAutoString outputHTML; // we'll ignore the generated output HTML
   FindURL(aInString, aInLength, aPos, kURLs, outputHTML, *aStartPos, *aEndPos);
   return NS_OK;
 }
 // FindURL tests whether the character at aInString[pos] belongs to a URL.
 // The URL notations (modes) are tried in the order given by |ranking|; the
 // first one for which a start, an end and a valid URL are found wins.
 // On success the anchor HTML is placed in outputHTML, replaceBefore and
 // replaceAfter receive the values computed by CalculateURLBoundaries, and
 // true is returned. On failure false is returned and replaceBefore /
 // replaceAfter are not written.
 bool
 mozTXTToHTMLConv::FindURL(const char16_t * aInString, int32_t aInLength, const uint32_t pos,
      const uint32_t whathasbeendone,
      nsString& outputHTML, int32_t& replaceBefore, int32_t& replaceAfter)
 {
   // Per-mode progress: unchecked -> startok -> endok -> success.
   // |invalid| marks modes that are not tried for this character.
   enum statetype {unchecked, invalid, startok, endok, success};
   static const modetype ranking[] = {RFC1738, RFC2396E, freetext, abbreviated};
   statetype state[mozTXTToHTMLConv_lastMode + 1]; // 0(=unknown)..lastMode
   /* I don't like this abuse of enums as index for the array,
      but I don't know a better method */
   // Define, which modes to check
   /* all modes but abbreviated are checked for text[pos] == ':',
      only abbreviated for '.', RFC2396E and abbreviated for '@' */
   for (modetype iState = unknown; iState <= mozTXTToHTMLConv_lastMode;
        iState = modetype(iState + 1))
     state[iState] = aInString[pos] == ':' ? unchecked : invalid;
   switch (aInString[pos])
   {
   case '@':
     state[RFC2396E] = unchecked;
     // no break here
   case '.':
     state[abbreviated] = unchecked;
     break;
   case ':':
     state[abbreviated] = invalid;
     break;
   default:
     break;
   }
   // Test, first successful mode wins, sequence defined by |ranking|
   int32_t iCheck = 0;  // the currently tested modetype
   modetype check = ranking[iCheck];
   for (; iCheck < mozTXTToHTMLConv_numberOfModes && state[check] != success;
        iCheck++)
     /* check state from last run.
        If this is the first, check this one, which isn't = success yet */
   {
     check = ranking[iCheck];
     // start/end are only read after FindURLStart / FindURLEnd have
     // written them, because each step below requires the previous state.
     uint32_t start, end;
     if (state[check] == unchecked)
       if (FindURLStart(aInString, aInLength, pos, check, start))
         state[check] = startok;
     if (state[check] == startok)
       if (FindURLEnd(aInString, aInLength, pos, check, start, end))
         state[check] = endok;
     if (state[check] == endok)
     {
       nsAutoString txtURL, desc;
       int32_t resultReplaceBefore, resultReplaceAfter;
       CalculateURLBoundaries(aInString, aInLength, pos, whathasbeendone, check, start, end,
                              txtURL, desc,
                              resultReplaceBefore, resultReplaceAfter);
       // For '.' and '@' the text has no scheme yet: let
       // CompleteAbbreviatedURL build the full URL. It leaves txtURL empty
       // when the text does not qualify.
       if (aInString[pos] != ':')
       {
         nsAutoString temp = txtURL;
         txtURL.SetLength(0);
         CompleteAbbreviatedURL(temp.get(),temp.Length(), pos - start, txtURL);
       }
       if (!txtURL.IsEmpty() && CheckURLAndCreateHTML(txtURL, desc, check,
                                                      outputHTML))
       {
         replaceBefore = resultReplaceBefore;
         replaceAfter = resultReplaceAfter;
         state[check] = success;
       }
     } // if
   } // for
   // |check| is the last mode that was tried.
   return state[check] == success;
 }
 // ItMatchesDelimited tests whether the text |rep| (aRepLen characters,
 // compared case-insensitively) occurs at the beginning of aInString and is
 // surrounded by the required kinds of characters.
 //   before - constraint on the character preceding |rep|. If it is not
 //            LT_IGNORE, that character is aInString[0] and |rep| is
 //            expected at index 1; with LT_IGNORE |rep| is expected at
 //            index 0.
 //   after  - constraint on the character following |rep|.
 // Constraint kinds as tested below: LT_ALPHA = ASCII letter, LT_DIGIT =
 // ASCII digit, LT_DELIMITER = neither an ASCII letter, nor an ASCII digit,
 // nor the first character of |rep|, LT_IGNORE = no constraint.
 bool
 mozTXTToHTMLConv::ItMatchesDelimited(const char16_t * aInString,
     int32_t aInLength, const char16_t* rep, int32_t aRepLen,
     LIMTYPE before, LIMTYPE after)
 {
   // this little method gets called a LOT. I found we were spending a
   // lot of time just calculating the length of the variable "rep"
   // over and over again every time we called it. So we're now passing
   // an integer in here.
   int32_t textLen = aInLength;
   // Reject input that is too short to hold |rep| plus the required
   // neighbour characters. An |after| of LT_DELIMITER does not require an
   // extra character to be present.
   if
     (
       ((before == LT_IGNORE && (after == LT_IGNORE || after == LT_DELIMITER))
         && textLen < aRepLen) ||
       ((before != LT_IGNORE || (after != LT_IGNORE && after != LT_DELIMITER))
         && textLen < aRepLen + 1) ||
       (before != LT_IGNORE && after != LT_IGNORE && after != LT_DELIMITER
         && textLen < aRepLen + 2)
     )
     return false;
   char16_t text0 = aInString[0];
   // The character right behind |rep|. NOTE(review): per the length checks
   // above this index can equal aInLength; presumably callers always pass
   // text with a terminator there -- confirm.
   char16_t textAfterPos = aInString[aRepLen + (before == LT_IGNORE ? 0 : 1)];
   if
     (
       (before == LT_ALPHA
         && !nsCRT::IsAsciiAlpha(text0)) ||
       (before == LT_DIGIT
         && !nsCRT::IsAsciiDigit(text0)) ||
       (before == LT_DELIMITER
         &&
         (
           nsCRT::IsAsciiAlpha(text0) ||
           nsCRT::IsAsciiDigit(text0) ||
           text0 == *rep
         )) ||
       (after == LT_ALPHA
         && !nsCRT::IsAsciiAlpha(textAfterPos)) ||
       (after == LT_DIGIT
         && !nsCRT::IsAsciiDigit(textAfterPos)) ||
       (after == LT_DELIMITER
         &&
         (
           nsCRT::IsAsciiAlpha(textAfterPos) ||
           nsCRT::IsAsciiDigit(textAfterPos) ||
           textAfterPos == *rep
         )) ||
         // finally the text itself has to match, ignoring case
         !Substring(Substring(aInString, aInString+aInLength),
                    (before == LT_IGNORE ? 0 : 1),
                    aRepLen).Equals(Substring(rep, rep+aRepLen),
                                    nsCaseInsensitiveStringComparator())
     )
     return false;
   return true;
 }
 // NumberOfMatches counts at how many positions of aInString the text |rep|
 // matches according to ItMatchesDelimited with the given |before| and
 // |after| constraints. Every position of the string is tested, so
 // overlapping matches are counted as well.
 uint32_t
 mozTXTToHTMLConv::NumberOfMatches(const char16_t * aInString, int32_t aInStringLength,
      const char16_t* rep, int32_t aRepLen, LIMTYPE before, LIMTYPE after)
 {
   uint32_t matches = 0;
   const char16_t * cursor = aInString;
   // |remaining| is the length of the text from |cursor| to the end
   for (int32_t remaining = aInStringLength; remaining > 0; ++cursor, --remaining)
   {
     if (ItMatchesDelimited(cursor, remaining, rep, aRepLen, before, after))
       ++matches;
   }
   return matches;
 }
 // NOTE: the converted html for the phrase is appended to aOutString
 // tagHTML and attributeHTML are plain ASCII (literal strings, in fact)
 bool
 mozTXTToHTMLConv::StructPhraseHit(const char16_t * aInString, int32_t aInStringLength, bool col0,
      const char16_t* tagTXT, int32_t aTagTXTLen,
      const char* tagHTML, const char* attributeHTML,
      nsString& aOutString, uint32_t& openTags)
 {
   /* We're searching for the following pattern:
      LT_DELIMITER - "*" - ALPHA -
      [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER.
      <strong> is only inserted, if existence of a pair could be verified
      We use the first opening/closing tag, if we can choose */
   const char16_t * newOffset = aInString;
   int32_t newLength = aInStringLength;
   if (!col0) // skip the first element?
   {
     newOffset = &aInString[1];
     newLength = aInStringLength - 1;
   }
   // opening tag
   if
     (
       ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen,
            (col0 ? LT_IGNORE : LT_DELIMITER), LT_ALPHA) // is opening tag
         && NumberOfMatches(newOffset, newLength, tagTXT, aTagTXTLen,
               LT_ALPHA, LT_DELIMITER)  // remaining closing tags
               > openTags
     )
   {
     openTags++;
     aOutString.AppendLiteral("<");
     aOutString.AppendASCII(tagHTML);
     aOutString.Append(char16_t(' '));
     aOutString.AppendASCII(attributeHTML);
     aOutString.AppendLiteral("><span class=\"moz-txt-tag\">");
     aOutString.Append(tagTXT);
     aOutString.AppendLiteral("</span>");
     return true;
   }
   // closing tag
   else if (openTags > 0
        && ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen, LT_ALPHA, LT_DELIMITER))
   {
     openTags--;
     aOutString.AppendLiteral("<span class=\"moz-txt-tag\">");
     aOutString.Append(tagTXT);
     aOutString.AppendLiteral("</span></");
     aOutString.AppendASCII(tagHTML);
     aOutString.Append(char16_t('>'));
     return true;
   }
   return false;
 }
 bool
 mozTXTToHTMLConv::SmilyHit(const char16_t * aInString, int32_t aLength, bool col0,
          const char* tagTXT, const char* imageName,
          nsString& outputHTML, int32_t& glyphTextLen)
 {
   if ( !aInString || !tagTXT || !imageName )
       return false;
   int32_t tagLen = strlen(tagTXT);
   uint32_t delim = (col0 ? 0 : 1) + tagLen;
   if
     (
       (col0 || IsSpace(aInString[0]))
         &&
         (
           aLength <= int32_t(delim) ||
           IsSpace(aInString[delim]) ||
           (aLength > int32_t(delim + 1)
             &&
             (
               aInString[delim] == '.' ||
               aInString[delim] == ',' ||
               aInString[delim] == ';' ||
               aInString[delim] == '8' ||
               aInString[delim] == '>' ||
               aInString[delim] == '!' ||
               aInString[delim] == '?'
             )
             && IsSpace(aInString[delim + 1]))
         )
         && ItMatchesDelimited(aInString, aLength, NS_ConvertASCIItoUTF16(tagTXT).get(), tagLen,
                               col0 ? LT_IGNORE : LT_DELIMITER, LT_IGNORE)
 	        // Note: tests at different pos for LT_IGNORE and LT_DELIMITER
     )
   {
     if (!col0)
     {
       outputHTML.Truncate();
       outputHTML.Append(char16_t(' '));
     }
     outputHTML.AppendLiteral("<span class=\""); // <span class="
     AppendASCIItoUTF16(imageName, outputHTML);  // e.g. smiley-frown
     outputHTML.AppendLiteral("\" title=\"");    // " title="
     AppendASCIItoUTF16(tagTXT, outputHTML);     // smiley tooltip
     outputHTML.AppendLiteral("\"><span>");      // "><span>
     AppendASCIItoUTF16(tagTXT, outputHTML);     // original text
     outputHTML.AppendLiteral("</span></span>"); // </span></span>
     glyphTextLen = (col0 ? 0 : 1) + tagLen;
     return true;
   }
   return false;
 }
 // Returns true if |aChar| is one of the characters that make GlyphHit try
 // the smiley table.
 static inline bool IsSmileyLeadChar(const char16_t aChar)
 {
   return aChar == ':' || aChar == ';' || aChar == '=' ||
          aChar == '>' || aChar == '8' || aChar == 'O';
 }
 // GlyphHit tests whether a "glyph" (a smiley, a form feed, "+/-" or a
 // power like x^2) stands at the beginning of aInString.
 // the glyph is appended to aOutputString instead of the original string...
 // On a hit the HTML is appended to aOutputString, the number of consumed
 // input characters is stored in glyphTextLen and true is returned.
 // |col0| tells the smiley and "+/-" tests that the text starts at
 // aInString[0]; otherwise a smiley is expected behind a leading space.
 bool
 mozTXTToHTMLConv::GlyphHit(const char16_t * aInString, int32_t aInLength, bool col0,
          nsString& aOutputString, int32_t& glyphTextLen)
 {
   // Smiley texts and the CSS class of their replacement, in the order in
   // which they are tried.
   static const struct { const char* text; const char* cssClass; } kSmileys[] = {
     { ":-)",  "moz-smiley-s1"  }, // smile
     { ":)",   "moz-smiley-s1"  }, // smile
     { ":-D",  "moz-smiley-s5"  }, // laughing
     { ":-(",  "moz-smiley-s2"  }, // frown
     { ":(",   "moz-smiley-s2"  }, // frown
     { ":-[",  "moz-smiley-s6"  }, // embarrassed
     { ";-)",  "moz-smiley-s3"  }, // wink
     { ";)",   "moz-smiley-s3"  }, // wink
     { ":-\\", "moz-smiley-s7"  }, // undecided
     { ":-P",  "moz-smiley-s4"  }, // tongue
     { ";-P",  "moz-smiley-s4"  }, // tongue
     { "=-O",  "moz-smiley-s8"  }, // surprise
     { ":-*",  "moz-smiley-s9"  }, // kiss
     { ">:o",  "moz-smiley-s10" }, // yell
     { ">:-o", "moz-smiley-s10" }, // yell
     { "8-)",  "moz-smiley-s11" }, // cool
     { ":-$",  "moz-smiley-s12" }, // money
     { ":-!",  "moz-smiley-s13" }, // foot
     { "O:-)", "moz-smiley-s14" }, // innocent
     { ":'(",  "moz-smiley-s15" }, // cry
     { ":-X",  "moz-smiley-s16" }  // sealed
   };
   const char16_t text0 = aInString[0];
   // Do not look behind the end of a one character text.
   const char16_t text1 = aInLength > 1 ? aInString[1] : char16_t(0);
   const char16_t firstChar = (col0 ? text0 : text1);
   // temporary variable used to store the glyph html text
   nsAutoString outputHTML;
   // There are two cases that need to be tried one after another:
   //   pass 0: the smiley starts where the caller says it does
   //           (index 0 with col0, behind the leading character otherwise)
   //   pass 1: we are in column 0, but the smiley follows a leading space
   for (int pass = 0; pass < 2; pass++)
   {
     bool testSmiley;
     bool smileyCol0;
     if (pass == 0)
     {
       testSmiley = IsSmileyLeadChar(firstChar);
       smileyCol0 = col0;
     }
     else
     {
       testSmiley = col0 && IsSmileyLeadChar(text1);
       smileyCol0 = false;
     }
     if (!testSmiley)
       continue;
     // Every smiley is tested with the same position flag. (";)" used to
     // be tested with col0 in both passes and was therefore missed behind
     // a leading space in column 0, unlike all the other smileys.)
     for (uint32_t k = 0; k < sizeof(kSmileys) / sizeof(kSmileys[0]); k++)
     {
       if (SmilyHit(aInString, aInLength, smileyCol0,
                    kSmileys[k].text, kSmileys[k].cssClass,
                    outputHTML, glyphTextLen))
       {
         aOutputString.Append(outputHTML);
         return true;
       }
     }
   }
   if (text0 == '\f')
   {
       aOutputString.AppendLiteral("<span class='moz-txt-formfeed'></span>");
       glyphTextLen = 1;
       return true;
   }
   if (text0 == '+' || text1 == '+')
   {
     if (ItMatchesDelimited(aInString, aInLength,
                            MOZ_UTF16(" +/-"), 4,
                            LT_IGNORE, LT_IGNORE))
     {
       aOutputString.AppendLiteral(" &plusmn;");
       glyphTextLen = 4;
       return true;
     }
     if (col0 && ItMatchesDelimited(aInString, aInLength,
                                    MOZ_UTF16("+/-"), 3,
                                    LT_IGNORE, LT_IGNORE))
     {
       aOutputString.AppendLiteral("&plusmn;");
       glyphTextLen = 3;
       return true;
     }
   }
   // x^2  =>  x<sup>2</sup>,   also handle powers x^-2,  x^0.5
   // implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/
   if
     (
       text1 == '^'
       &&
       (
         nsCRT::IsAsciiDigit(text0) || nsCRT::IsAsciiAlpha(text0) ||
         text0 == ')' || text0 == ']' || text0 == '}'
       )
       &&
       (
         (2 < aInLength && nsCRT::IsAsciiDigit(aInString[2])) ||
         (3 < aInLength && aInString[2] == '-' && nsCRT::IsAsciiDigit(aInString[3]))
       )
     )
   {
     // Find first non-digit
     int32_t delimPos = 3;  // skip "^" and first digit (or '-')
     for (; delimPos < aInLength
            &&
            (
              nsCRT::IsAsciiDigit(aInString[delimPos]) ||
              (aInString[delimPos] == '.' && delimPos + 1 < aInLength &&
                nsCRT::IsAsciiDigit(aInString[delimPos + 1]))
            );
          delimPos++)
       ;
     // A letter directly behind the exponent means this is no power.
     if (delimPos < aInLength && nsCRT::IsAsciiAlpha(aInString[delimPos]))
     {
       return false;
     }
     outputHTML.Truncate();
     outputHTML += text0;
     outputHTML.AppendLiteral(
       "<sup class=\"moz-txt-sup\">"
       "<span style=\"display:inline-block;width:0;height:0;overflow:hidden\">"
       "^</span>");
     aOutputString.Append(outputHTML);
     aOutputString.Append(&aInString[2], delimPos - 2);
     aOutputString.AppendLiteral("</sup>");
     glyphTextLen = delimPos /* - 1 + 1 */ ;
     return true;
   }
   /*
    The following strings are not substituted:
    |TXT   |HTML     |Reason
    +------+---------+----------
     ->     &larr;    Bug #454
     =>     &lArr;    ditto
     <-     &rarr;    ditto
     <=     &rArr;    ditto
     (tm)   &trade;   ditto
     1/4    &frac14;  is triggered by 1/4 Part 1, 2/4 Part 2, ...
     3/4    &frac34;  ditto
     1/2    &frac12;  similar
   */
   return false;
 }
 /***************************************************************************
   Library-internal Interface
 ****************************************************************************/
 // Default constructor. The body is intentionally empty: no explicit
 // initialization is performed here.
 mozTXTToHTMLConv::mozTXTToHTMLConv()
 {
 }
 // Destructor. The body is intentionally empty: no explicit cleanup is
 // performed here.
 mozTXTToHTMLConv::~mozTXTToHTMLConv()
 {
 }
 // Registers the interfaces this class exposes with the NS_IMPL_ISUPPORTS
 // macro (defined outside this file; presumably it generates the nsISupports
 // reference-counting and interface-query boilerplate -- confirm in the
 // macro's header). Most of the stream-related methods further down are
 // stubs that return NS_ERROR_NOT_IMPLEMENTED.
 NS_IMPL_ISUPPORTS(mozTXTToHTMLConv,
                   mozITXTToHTMLConv,
                   nsIStreamConverter,
                   nsIStreamListener,
                   nsIRequestObserver)
 int32_t
 mozTXTToHTMLConv::CiteLevelTXT(const char16_t *line,
 				    uint32_t& logLineStart)
 {
   int32_t result = 0;
   int32_t lineLength = NS_strlen(line);
   bool moreCites = true;
   while (moreCites)
   {
     /* E.g. the following lines count as quote:
        > text
        //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
        >text
        //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
            > text
        ] text
        USER> text
        USER] text
        //#endif
        logLineStart is the position of "t" in this example
     */
     uint32_t i = logLineStart;
 #ifdef QUOTE_RECOGNITION_AGGRESSIVE
     for (; int32_t(i) < lineLength && IsSpace(line[i]); i++)
       ;
     for (; int32_t(i) < lineLength && nsCRT::IsAsciiAlpha(line[i])
                                    && nsCRT::IsUpper(line[i])   ; i++)
       ;
     if (int32_t(i) < lineLength && (line[i] == '>' || line[i] == ']'))
 #else
     if (int32_t(i) < lineLength && line[i] == '>')
 #endif
     {
       i++;
       if (int32_t(i) < lineLength && line[i] == ' ')
         i++;
       // sendmail/mbox
       // Placed here for performance increase
       const char16_t * indexString = &line[logLineStart];
            // here, |logLineStart < lineLength| is always true
       uint32_t minlength = std::min(uint32_t(6), NS_strlen(indexString));
       if (Substring(indexString,
                     indexString+minlength).Equals(Substring(NS_LITERAL_STRING(">From "), 0, minlength),
                                                   nsCaseInsensitiveStringComparator()))
         //XXX RFC2646
         moreCites = false;
       else
       {
         result++;
         logLineStart = i;
       }
     }
     else
       moreCites = false;
   }
   return result;
 }
 /**
  * Converts a run of plain text to HTML and appends it to aOutString.
  *
  * The input is walked position by position. Depending on the bits set in
  * whattodo, each position is offered, in this order, to:
  *   1. GlyphHit()        (kGlyphSubstitution)
  *   2. StructPhraseHit() (kStructPhrase) for '*', '/', '_' and '|'
  *   3. FindURL()         (kURLs) for ':', '@' and '.'
  * A position none of them consumes is copied to the output; '<', '>' and
  * '&' are passed through EscapeChar() first.
  *
  * @param aInString        text to scan
  * @param aInStringLength  number of char16_t units of aInString to scan;
  *                         no unit at or beyond this index is read
  * @param whattodo         bit mask tested against kURLs, kGlyphSubstitution
  *                         and kStructPhrase; also forwarded to FindURL()
  * @param aOutString       receives the generated HTML (appended)
  */
 void
 mozTXTToHTMLConv::ScanTXT(const char16_t * aInString, int32_t aInStringLength, uint32_t whattodo, nsString& aOutString)
 {
   bool doURLs = 0 != (whattodo & kURLs);
   bool doGlyphSubstitution = 0 != (whattodo & kGlyphSubstitution);
   bool doStructPhrase = 0 != (whattodo & kStructPhrase);
   uint32_t structPhrase_strong = 0;  // Number of currently open tags
   uint32_t structPhrase_underline = 0;
   uint32_t structPhrase_italic = 0;
   uint32_t structPhrase_code = 0;
   nsAutoString outputHTML;  // moved here for performance increase
   for(uint32_t i = 0; int32_t(i) < aInStringLength;)
   {
     if (doGlyphSubstitution)
     {
       int32_t glyphTextLen;
       if (GlyphHit(&aInString[i], aInStringLength - i, i == 0, aOutString, glyphTextLen))
       {
         i += glyphTextLen;
         continue;
       }
     }
     if (doStructPhrase)
     {
       // StructPhraseHit is handed the text starting one character *before*
       // the delimiter, except at the very start of the input where there is
       // no preceding character (signalled by the |i == 0| argument).
       const char16_t * newOffset = aInString;
       int32_t newLength = aInStringLength;
       if (i > 0 ) // skip the first element?
       {
         newOffset = &aInString[i-1];
         newLength = aInStringLength - i + 1;
       }
       switch (aInString[i]) // Performance increase
       {
       case '*':
         if (StructPhraseHit(newOffset, newLength, i == 0,
                             MOZ_UTF16("*"), 1,
                             "b", "class=\"moz-txt-star\"",
                             aOutString, structPhrase_strong))
         {
           i++;
           continue;
         }
         break;
       case '/':
         if (StructPhraseHit(newOffset, newLength, i == 0,
                             MOZ_UTF16("/"), 1,
                             "i", "class=\"moz-txt-slash\"",
                             aOutString, structPhrase_italic))
         {
           i++;
           continue;
         }
         break;
       case '_':
         if (StructPhraseHit(newOffset, newLength, i == 0,
                             MOZ_UTF16("_"), 1,
                             "span" /* <u> is deprecated */,
                             "class=\"moz-txt-underscore\"",
                             aOutString, structPhrase_underline))
         {
           i++;
           continue;
         }
         break;
       case '|':
         if (StructPhraseHit(newOffset, newLength, i == 0,
                             MOZ_UTF16("|"), 1,
                             "code", "class=\"moz-txt-verticalline\"",
                             aOutString, structPhrase_code))
         {
           i++;
           continue;
         }
         break;
       }
     }
     if (doURLs)
     {
       switch (aInString[i])
       {
       case ':':
       case '@':
       case '.':
         // Performance increase: only bother FindURL when neither neighbour
         // is a space. The right-hand neighbour is inspected only when it
         // lies inside the scanned range, so the last character of the input
         // never triggers a read past aInStringLength.
         if ((i == 0 || aInString[i - 1] != ' ') &&
             (int32_t(i) + 1 >= aInStringLength || aInString[i + 1] != ' '))
         {
           int32_t replaceBefore;
           int32_t replaceAfter;
           if (FindURL(aInString, aInStringLength, i, whattodo,
                       outputHTML, replaceBefore, replaceAfter)
                   && structPhrase_strong + structPhrase_italic +
                        structPhrase_underline + structPhrase_code == 0
                        /* workaround for bug #19445 */ )
           {
             // Drop the part of the URL that was already emitted as plain
             // text, then append the generated markup in its place.
             aOutString.Cut(aOutString.Length() - replaceBefore, replaceBefore);
             aOutString += outputHTML;
             i += replaceAfter + 1;
             continue;
           }
         }
         break;
       } //switch
     }
     switch (aInString[i])
     {
     // Special symbols
     case '<':
     case '>':
     case '&':
       EscapeChar(aInString[i], aOutString, false);
       i++;
       break;
     // Normal characters
     default:
       aOutString += aInString[i];
       i++;
       break;
     }
   }
 }
 // Returns true when aChar can directly follow a tag name inside a tag, i.e.
 // it is white space or the closing '>'. Used to tell "<a ...>" / "<a>" apart
 // from longer tag names that merely begin with 'a' (e.g. "<abbr>").
 static inline bool
 IsTagNameDelimiter(const char16_t aChar)
 {
   return aChar == ' ' || aChar == '\f' || aChar == '\n' ||
          aChar == '\r' || aChar == '\t' || aChar == '>';
 }

 /**
  * Scans HTML source, converting only the text between tags.
  *
  * Copied to aOutString unchanged:
  *   - every tag ("<[...]>"),
  *   - the whole of an anchor element ("<a[...]</a>"),
  *   - the whole of a comment ("<!--[...]-->").
  * Everything else (text between tags) is unescaped with UnescapeStr() and
  * handed to ScanTXT(). An unterminated tag, anchor or comment extends to the
  * end of the input.
  *
  * @param aInString   HTML source to scan
  * @param whattodo    conversion flags, forwarded to ScanTXT()
  * @param aOutString  receives the result (appended)
  */
 void
 mozTXTToHTMLConv::ScanHTML(nsString& aInString, uint32_t whattodo, nsString &aOutString)
 {
   // some common variables we were recalculating
   // every time inside the for loop...
   int32_t lengthOfInString = aInString.Length();
   const char16_t * uniBuffer = aInString.get();
 #ifdef DEBUG_BenB_Perf
   PRTime parsing_start = PR_IntervalNow();
 #endif
   for (int32_t i = 0; i < lengthOfInString;)
   {
     if (uniBuffer[i] == '<')  // html tag
     {
       const int32_t start = i;
       // Number of characters from the '<' to the end of the input; every
       // look-ahead below is checked against it before the buffer is read.
       const int32_t remaining = lengthOfInString - i;
       if (remaining > 2 &&
           (uniBuffer[i + 1] == 'a' || uniBuffer[i + 1] == 'A') &&
           IsTagNameDelimiter(uniBuffer[i + 2]))
            // if a tag, skip until </a>
            // The char16_t is compared directly (no narrowing cast), and a
            // delimiter is required so that e.g. "<abbr>" is not mistaken
            // for an anchor.
       {
         i = aInString.Find("</a>", true, i);
         if (i == kNotFound)
           i = lengthOfInString;
         else
           i += 4;
       }
       else if (remaining > 3 &&
                uniBuffer[i + 1] == '!' && uniBuffer[i + 2] == '-' &&
                uniBuffer[i + 3] == '-')
           // if commented-out code, skip until -->
       {
         i = aInString.Find("-->", false, i);
         if (i == kNotFound)
           i = lengthOfInString;
         else
           i += 3;
       }
       else  // just skip tag (attributes etc.)
       {
         i = aInString.FindChar('>', i);
         if (i == kNotFound)
           i = lengthOfInString;
         else
           i++;
       }
       aOutString.Append(&uniBuffer[start], uint32_t(i - start));
     }
     else
     {
       // Plain text up to the next '<' (or the end of the input).
       const int32_t start = i;
       i = aInString.FindChar('<', i);
       if (i == kNotFound)
         i = lengthOfInString;
       nsString tempString;
       tempString.SetCapacity(uint32_t(uint32_t(i - start) * growthRate));
       UnescapeStr(uniBuffer, start, uint32_t(i - start), tempString);
       ScanTXT(tempString.get(), tempString.Length(), whattodo, aOutString);
     }
   }
 #ifdef DEBUG_BenB_Perf
   printf("ScanHTML time:    %d ms\n", PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start));
 #endif
 }
 /****************************************************************************
   XPCOM Interface
 *****************************************************************************/
 // Stream-to-stream conversion is not provided by this class: the arguments
 // are ignored and NS_ERROR_NOT_IMPLEMENTED is returned unconditionally.
 NS_IMETHODIMP
 mozTXTToHTMLConv::Convert(nsIInputStream *aFromStream,
                           const char *aFromType, const char *aToType,
                           nsISupports *aCtxt,
                           nsIInputStream **_retval)
 {
   return NS_ERROR_NOT_IMPLEMENTED;
 }
 // Asynchronous conversion is not provided by this class: the arguments are
 // ignored and NS_ERROR_NOT_IMPLEMENTED is returned unconditionally.
 NS_IMETHODIMP
 mozTXTToHTMLConv::AsyncConvertData(const char *aFromType, const char *aToType,
                                    nsIStreamListener *aListener,
                                    nsISupports *aCtxt)
 {
   return NS_ERROR_NOT_IMPLEMENTED;
 }
 // Incoming stream data is not consumed by this class: the arguments are
 // ignored and NS_ERROR_NOT_IMPLEMENTED is returned unconditionally.
 NS_IMETHODIMP
 mozTXTToHTMLConv::OnDataAvailable(nsIRequest* request,
                                   nsISupports *ctxt,
                                   nsIInputStream *inStr,
                                   uint64_t sourceOffset, uint32_t count)
 {
   return NS_ERROR_NOT_IMPLEMENTED;
 }
 // Request-start notification is not handled by this class: the arguments
 // are ignored and NS_ERROR_NOT_IMPLEMENTED is returned unconditionally.
 NS_IMETHODIMP
 mozTXTToHTMLConv::OnStartRequest(nsIRequest* request,
                                  nsISupports *ctxt)
 {
   return NS_ERROR_NOT_IMPLEMENTED;
 }
 // Request-stop notification is not handled by this class: the arguments
 // are ignored and NS_ERROR_NOT_IMPLEMENTED is returned unconditionally.
 NS_IMETHODIMP
 mozTXTToHTMLConv::OnStopRequest(nsIRequest* request,
                                 nsISupports *ctxt,
                                 nsresult aStatus)
 {
   return NS_ERROR_NOT_IMPLEMENTED;
 }
 /**
  * XPCOM wrapper around the internal CiteLevelTXT(line, logLineStart).
  *
  * @param line          line to inspect
  * @param logLineStart  in/out start index, forwarded by reference
  * @param _retval       receives the value returned by the internal overload
  * @return NS_ERROR_NULL_POINTER if any pointer argument is null,
  *         NS_OK otherwise
  */
 NS_IMETHODIMP
 mozTXTToHTMLConv::CiteLevelTXT(const char16_t *line, uint32_t *logLineStart,
                                uint32_t *_retval)
 {
   if (line && logLineStart && _retval)
   {
     *_retval = CiteLevelTXT(line, *logLineStart);
     return NS_OK;
   }
   return NS_ERROR_NULL_POINTER;
 }
 /**
  * XPCOM wrapper around the internal ScanTXT(): converts a null-terminated
  * plain-text string to HTML and returns the result in a newly allocated
  * buffer.
  *
  * @param text      null-terminated text to convert; must not be null
  * @param whattodo  conversion flags, forwarded to the internal ScanTXT()
  * @param _retval   receives the newly allocated result string. For empty
  *                  input this is a copy of the input (a debug assertion
  *                  fires in that case).
  * @return NS_OK on success, NS_ERROR_OUT_OF_MEMORY if the result buffer
  *         could not be allocated (on either path)
  */
 NS_IMETHODIMP
 mozTXTToHTMLConv::ScanTXT(const char16_t *text, uint32_t whattodo,
                           char16_t **_retval)
 {
   NS_ENSURE_ARG(text);
   // FIX ME!!!
   int32_t inLength = NS_strlen(text);
   NS_ASSERTION(inLength, "ScanTXT passed 0 length string");
   if (inLength == 0) {
     // Nothing to convert; hand back a copy. The copy can fail, so it goes
     // through the same out-of-memory check as the regular path below.
     *_retval = NS_strdup(text);
   } else {
     nsString outString;
     // by setting a large capacity up front, we save time
     // when appending characters to the output string because we don't
     // need to reallocate and re-copy the characters already in the out String.
     outString.SetCapacity(uint32_t(inLength * growthRate));
     ScanTXT(text, inLength, whattodo, outString);
     *_retval = ToNewUnicode(outString);
   }
   return *_retval ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
 }
 /**
  * XPCOM wrapper around the internal ScanHTML(): converts a null-terminated
  * HTML string and returns the result in a newly allocated buffer.
  *
  * @param text      null-terminated HTML to convert; must not be null
  * @param whattodo  conversion flags, forwarded to the internal ScanHTML()
  * @param _retval   receives the newly allocated result string
  * @return NS_OK on success, NS_ERROR_OUT_OF_MEMORY if the result buffer
  *         could not be allocated
  */
 NS_IMETHODIMP
 mozTXTToHTMLConv::ScanHTML(const char16_t *text, uint32_t whattodo,
                            char16_t **_retval)
 {
   NS_ENSURE_ARG(text);
   // FIX ME!!!
   // The internal overload wants an nsString, which costs a full copy of
   // the input buffer.
   nsString source(text);
   nsString converted;
   converted.SetCapacity(uint32_t(source.Length() * growthRate));
   ScanHTML(source, whattodo, converted);

   *_retval = ToNewUnicode(converted);
   if (!*_retval)
     return NS_ERROR_OUT_OF_MEMORY;
   return NS_OK;
 }
 /**
  * Creates a mozTXTToHTMLConv and stores an add-ref'ed pointer to it in
  * *aConv.
  *
  * @param aConv  receives the new converter; must not be null
  * @return NS_ERROR_NULL_POINTER if aConv is null, NS_ERROR_OUT_OF_MEMORY if
  *         the allocation yielded null, NS_OK otherwise
  */
 nsresult
 MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv** aConv)
 {
     NS_PRECONDITION(aConv != nullptr, "null ptr");
     if (aConv == nullptr)
       return NS_ERROR_NULL_POINTER;

     mozTXTToHTMLConv* converter = new mozTXTToHTMLConv();
     *aConv = converter;
     if (converter == nullptr)
       return NS_ERROR_OUT_OF_MEMORY;

     NS_ADDREF(converter);
     // No Init() call is made on the new object (it was commented out
     // in the original: "return (*aConv)->Init();").
     return NS_OK;
 }

The Tor Browser / annotate

netwerk/streamconv/converters/mozTXTToHTMLConv.cpp@deefc01c0e14 (annotated)

netwerk/streamconv/converters/mozTXTToHTMLConv.cpp