michael@0: /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0:  
michael@0: //	First checked in on 98/12/03 by John R. McMullen, derived from net.h/mkparse.c.
michael@0: 
michael@0: #include "nsEscape.h"
michael@0: #include "nsMemory.h"
michael@0: #include "nsCRT.h"
michael@0: #include "nsReadableUtils.h"
michael@0: 
michael@0: const int netCharType[256] =
michael@0: /*	Per-byte table of escape-mask bits, consulted through IS_OK(): a byte
michael@0: **	is emitted unescaped when its entry shares a bit with the mask in use.
michael@0: **
michael@0: **	Bit 0 (1)	xalpha		-- the alphas
michael@0: **	Bit 1 (2)	xpalpha		-- as xalpha but
michael@0: **                             converts spaces to plus and plus to %2B
michael@0: **	Bit 2 (4)	path		-- as xalpha but doesn't escape '/' or '+'
michael@0: **
michael@0: **	Only indices 0x00-0x80 are spelled out below; the remaining entries
michael@0: **	are zero-initialized, so no byte above 0x7F is ever marked as safe.
michael@0: */
michael@0:     /*   0 1 2 3 4 5 6 7 8 9 A B C D E F */
michael@0:     {    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x */
michael@0: 		 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 1x */
michael@0: 		 0,0,0,0,0,0,0,0,0,0,7,4,0,7,7,4,	/* 2x   !"#$%&'()*+,-./	 */
michael@0:          7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0,	/* 3x  0123456789:;<=>?	 */
michael@0: 	     0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* 4x  @ABCDEFGHIJKLMNO  */
michael@0: 	     /* bits for '@' (first entry of the 4x row) changed from 7  */
michael@0: 	     /* to 0 so '@' can be escaped in usernames and passwords in */
michael@0: 	     7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7,	/* publishing.   5x  PQRSTUVWXYZ[\]^_	 */
michael@0: 	     0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* 6x  `abcdefghijklmno	 */
michael@0: 	     7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,	/* 7x  pqrstuvwxyz{|}~	DEL */
michael@0: 		 0, /* 0x80 */ };
michael@0: 
michael@0: /* Helper macros for %-escaping.
michael@0:  *
michael@0:  * UNHEX(C)    decodes one hex digit of a % escape into its value (0-15);
michael@0:  *             any character that is not a hex digit yields 0.  C is expanded
michael@0:  *             several times and is not parenthesized, so pass only a simple,
michael@0:  *             side-effect-free expression.
michael@0:  * IS_OK(C)    is non-zero when the netCharType entry for byte C has a bit in
michael@0:  *             common with `flags`.  `flags` is not a macro parameter: it is
michael@0:  *             picked up from whatever scope the macro is expanded in.
michael@0:  * HEX_ESCAPE  is the '%' character that introduces an escape sequence.
michael@0:  */
michael@0: #define UNHEX(C) \
michael@0:     ((C >= '0' && C <= '9') ? C - '0' : \
michael@0:      ((C >= 'A' && C <= 'F') ? C - 'A' + 10 : \
michael@0:      ((C >= 'a' && C <= 'f') ? C - 'a' + 10 : 0)))
michael@0: 
michael@0: 
michael@0: #define IS_OK(C) (netCharType[((unsigned int) (C))] & (flags))
michael@0: #define HEX_ESCAPE '%'
michael@0: 
michael@0: //----------------------------------------------------------------------------------------
michael@0: // Returns a buffer obtained from nsMemory::Alloc() holding a NUL-terminated
michael@0: // copy of |str| in which every byte rejected by IS_OK() for |flags| is
michael@0: // replaced by "%XX" (upper-case hex).  When |flags| is exactly url_XPAlphas,
michael@0: // a rejected space is written as '+' instead.
michael@0: //
michael@0: // Returns null when |str| is null, when the size computation overflows or
michael@0: // exceeds UINT32_MAX, or when the allocation fails.  If |out_len| is non-null
michael@0: // it receives the length of the result, not counting the terminator.
michael@0: static char* nsEscapeCount(
michael@0:     const char * str,
michael@0:     nsEscapeMask flags,
michael@0:     size_t* out_len)
michael@0: //----------------------------------------------------------------------------------------
michael@0: {
michael@0:     if (!str)
michael@0:         return nullptr;
michael@0: 
michael@0:     static const char kHexDigits[] = "0123456789ABCDEF";
michael@0:     const unsigned char* const input =
michael@0:         reinterpret_cast<const unsigned char*>(str);
michael@0: 
michael@0:     // Pass 1: measure the input and count the bytes IS_OK() rejects.
michael@0:     size_t inputLen = 0;
michael@0:     size_t rejected = 0;
michael@0:     for (const unsigned char* scan = input; *scan; ++scan)
michael@0:     {
michael@0:         ++inputLen;
michael@0:         if (!IS_OK(*scan))
michael@0:             ++rejected;
michael@0:     }
michael@0: 
michael@0:     // Space needed: input length + 2 extra bytes per rejected byte + '\0'.
michael@0:     // The sum is built in two steps so that wrap-around can be detected.
michael@0:     size_t needed = inputLen + 1 + rejected;
michael@0:     if (needed <= inputLen)
michael@0:         return nullptr;
michael@0:     needed += rejected;
michael@0:     if (needed < inputLen)
michael@0:         return nullptr;
michael@0: 
michael@0:     // Refuse anything above 4GB.  The original note here explains that
michael@0:     // nsMemory::Alloc(size_t) ends up in PR_Malloc(uint32_t), so a larger
michael@0:     // request could not be satisfied anyway.
michael@0:     if (needed > UINT32_MAX)
michael@0:         return nullptr;
michael@0: 
michael@0:     char* const result = static_cast<char*>(nsMemory::Alloc(needed));
michael@0:     if (!result)
michael@0:         return nullptr;
michael@0: 
michael@0:     // Pass 2: emit.  Only the exact url_XPAlphas mask maps ' ' to '+'.
michael@0:     const bool spaceToPlus = (flags == url_XPAlphas);
michael@0:     unsigned char* const outBegin = reinterpret_cast<unsigned char*>(result);
michael@0:     unsigned char* out = outBegin;
michael@0:     for (size_t pos = 0; pos < inputLen; ++pos)
michael@0:     {
michael@0:         const unsigned char c = input[pos];
michael@0:         if (IS_OK(c))
michael@0:         {
michael@0:             *out++ = c;
michael@0:         }
michael@0:         else if (spaceToPlus && c == ' ')
michael@0:         {
michael@0:             *out++ = '+';
michael@0:         }
michael@0:         else
michael@0:         {
michael@0:             *out++ = HEX_ESCAPE;
michael@0:             *out++ = kHexDigits[c >> 4];      /* high nibble */
michael@0:             *out++ = kHexDigits[c & 0x0f];    /* low nibble */
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     *out = '\0';
michael@0:     if (out_len)
michael@0:         *out_len = out - outBegin;
michael@0:     return result;
michael@0: }
michael@0: 
michael@0: //----------------------------------------------------------------------------------------
michael@0: // Escapes |str| according to |flags| and returns the newly allocated result;
michael@0: // a null |str| yields null.  Thin wrapper around nsEscapeCount() for callers
michael@0: // that do not need the length of the escaped string.
michael@0: char* nsEscape(const char * str, nsEscapeMask flags)
michael@0: //----------------------------------------------------------------------------------------
michael@0: {
michael@0:     return str ? nsEscapeCount(str, flags, nullptr) : nullptr;
michael@0: }
michael@0: 
michael@0: //----------------------------------------------------------------------------------------
michael@0: // Decodes the %XX escapes of |str| in place via nsUnescapeCount() and returns
michael@0: // |str| itself.  A null |str| is returned untouched instead of being passed
michael@0: // on to nsUnescapeCount(), which reads through the pointer.
michael@0: char* nsUnescape(char * str)
michael@0: //----------------------------------------------------------------------------------------
michael@0: {
michael@0:     if (str)
michael@0:         nsUnescapeCount(str);
michael@0:     return str;
michael@0: }
michael@0: 
michael@0: //----------------------------------------------------------------------------------------
michael@0: // True when |c| is an ASCII hexadecimal digit (0-9, A-F, a-f).
michael@0: static inline bool nsUnescapeIsHexDigit(char c)
michael@0: {
michael@0:     return (c >= '0' && c <= '9') ||
michael@0:            (c >= 'A' && c <= 'F') ||
michael@0:            (c >= 'a' && c <= 'f');
michael@0: }
michael@0: 
michael@0: //----------------------------------------------------------------------------------------
michael@0: // Decodes every "%XX" sequence (XX = two hex digits, either case) of the
michael@0: // NUL-terminated string |str| in place and returns the decoded length, not
michael@0: // counting the terminator.  A '%' that is not followed by two hex digits is
michael@0: // copied through unchanged.  A null or empty |str| returns 0 without writing
michael@0: // anything.
michael@0: int32_t nsUnescapeCount(char * str)
michael@0: //----------------------------------------------------------------------------------------
michael@0: {
michael@0:     // Nothing to unescape.  Return before any store: an empty string might
michael@0:     // not actually be mutable.
michael@0:     if (!str || !*str)
michael@0:         return 0;
michael@0: 
michael@0:     const char *src = str;
michael@0:     char *dst = str;    // never runs ahead of src, so decoding in place is safe
michael@0: 
michael@0:     while (*src)
michael@0:     {
michael@0:         // src[1] is at worst the terminator, and the && chain stops there,
michael@0:         // so src[2] is only read when src[1] is a (non-NUL) hex digit.
michael@0:         if (*src == HEX_ESCAPE &&
michael@0:             nsUnescapeIsHexDigit(src[1]) &&
michael@0:             nsUnescapeIsHexDigit(src[2]))
michael@0:         {
michael@0:             *dst++ = (char)((UNHEX(src[1]) << 4) + UNHEX(src[2]));
michael@0:             src += 3;   /* walk over the escape and both digits */
michael@0:         }
michael@0:         else
michael@0:         {
michael@0:             *dst++ = *src++;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     *dst = 0;
michael@0:     return (int32_t)(dst - str);
michael@0: }
michael@0: 
michael@0: 
michael@0: // Returns a buffer obtained from NS_Alloc() holding a NUL-terminated copy of
michael@0: // |string| in which '<', '>', '&', '"' and '\'' are replaced by "&lt;",
michael@0: // "&gt;", "&amp;", "&quot;" and "&#39;".  Returns null when |string| is null,
michael@0: // when it is too long for the worst-case size to be computed safely, or when
michael@0: // the allocation fails.
michael@0: char *
michael@0: nsEscapeHTML(const char * string)
michael@0: {
michael@0:     if (!string)
michael@0:       return nullptr;
michael@0: 
michael@0:     /* XXX Hardcoded max entity len. The +1 is for the trailing null. */
michael@0:     // Keep the length in a size_t: narrowing it to 32 bits before the range
michael@0:     // check would let a string longer than 4GB slip through with a truncated
michael@0:     // length and overflow the buffer allocated below.
michael@0:     const size_t len = strlen(string);
michael@0:     if (len >= (UINT32_MAX / 6))
michael@0:       return nullptr;
michael@0: 
michael@0:     char *rv = (char *)NS_Alloc( (6 * len) + 1 );
michael@0:     if (!rv)
michael@0:       return nullptr;
michael@0: 
michael@0:     char *ptr = rv;
michael@0:     for (; *string != '\0'; string++)
michael@0:       {
michael@0:         // Replacement text for the current character, or null to copy it.
michael@0:         const char *entity = nullptr;
michael@0:         switch (*string)
michael@0:           {
michael@0:             case '<':  entity = "&lt;";   break;
michael@0:             case '>':  entity = "&gt;";   break;
michael@0:             case '&':  entity = "&amp;";  break;
michael@0:             case '"':  entity = "&quot;"; break;
michael@0:             case '\'': entity = "&#39;";  break;
michael@0:             default:   break;
michael@0:           }
michael@0: 
michael@0:         if (entity)
michael@0:           {
michael@0:             while (*entity != '\0')
michael@0:               *ptr++ = *entity++;
michael@0:           }
michael@0:         else
michael@0:           {
michael@0:             *ptr++ = *string;
michael@0:           }
michael@0:       }
michael@0:     *ptr = '\0';
michael@0: 
michael@0:     return(rv);
michael@0: }
michael@0: 
michael@0: // UTF-16 counterpart of nsEscapeHTML(): returns a buffer obtained from
michael@0: // nsMemory::Alloc() holding a NUL-terminated copy of the first
michael@0: // |aSourceBufferLen| code units of |aSourceBuffer| with '<', '>', '&', '"'
michael@0: // and '\'' replaced by "&lt;", "&gt;", "&amp;", "&quot;" and "&#39;".
michael@0: // A negative |aSourceBufferLen| means the length is measured with
michael@0: // NS_strlen().  Returns null if the input is too long or the allocation fails.
michael@0: char16_t *
michael@0: nsEscapeHTML2(const char16_t *aSourceBuffer, int32_t aSourceBufferLen)
michael@0: {
michael@0:   // Calculate the length, if the caller didn't.
michael@0:   if (aSourceBufferLen < 0) {
michael@0:     aSourceBufferLen = NS_strlen(aSourceBuffer);
michael@0:   }
michael@0: 
michael@0:   /* XXX Hardcoded max entity len. */
michael@0:   const size_t kMaxUnitsPerChar = 6;
michael@0:   const size_t lengthLimit =
michael@0:     (UINT32_MAX - sizeof(char16_t)) / (kMaxUnitsPerChar * sizeof(char16_t));
michael@0:   if (uint32_t(aSourceBufferLen) >= lengthLimit)
michael@0:     return nullptr;
michael@0: 
michael@0:   // Worst case: every code unit expands to six, plus one for the terminator.
michael@0:   const size_t byteCount =
michael@0:     size_t(aSourceBufferLen) * kMaxUnitsPerChar * sizeof(char16_t) +
michael@0:     sizeof(char16_t);
michael@0:   char16_t *resultBuffer = static_cast<char16_t *>(nsMemory::Alloc(byteCount));
michael@0:   if (!resultBuffer)
michael@0:     return nullptr;
michael@0: 
michael@0:   char16_t *out = resultBuffer;
michael@0:   for (int32_t pos = 0; pos < aSourceBufferLen; pos++) {
michael@0:     const char16_t unit = aSourceBuffer[pos];
michael@0: 
michael@0:     // Replacement text for this code unit, or null to copy it unchanged.
michael@0:     const char *entity;
michael@0:     switch (unit) {
michael@0:       case '<':  entity = "&lt;";   break;
michael@0:       case '>':  entity = "&gt;";   break;
michael@0:       case '&':  entity = "&amp;";  break;
michael@0:       case '"':  entity = "&quot;"; break;
michael@0:       case '\'': entity = "&#39;";  break;
michael@0:       default:   entity = nullptr;  break;
michael@0:     }
michael@0: 
michael@0:     if (!entity) {
michael@0:       *out++ = unit;
michael@0:       continue;
michael@0:     }
michael@0:     for (; *entity != '\0'; ++entity) {
michael@0:       *out++ = char16_t(*entity);
michael@0:     }
michael@0:   }
michael@0:   *out = 0;
michael@0: 
michael@0:   return resultBuffer;
michael@0: }
michael@0: 
michael@0: //----------------------------------------------------------------------------------------
michael@0: 
michael@0: const int EscapeChars[256] =
michael@0: /* Per-byte bitmask used by NO_NEED_ESC(): the macro ANDs a byte's entry
michael@0:    with `flags`, and a non-zero result means the byte may be left unescaped.
michael@0:    The bit values are those of the URL-part flags listed in the comment
michael@0:    further down (1 = scheme ... 512 = ref), so 1023 means "safe in every
michael@0:    part" and 0 means "never safe".  Only indices 0x00-0x80 are spelled out;
michael@0:    the remaining entries are zero-initialized.
michael@0: 
michael@0:         0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F */
michael@0: {
michael@0:         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,       /* 0x */
michael@0:         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  	    /* 1x */
michael@0:         0,1023,   0, 512,1023,   0,1023,   0,1023,1023,1023,1023,1023,1023, 953, 784,       /* 2x   !"#$%&'()*+,-./	 */
michael@0:      1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1008,1008,   0,1008,   0, 768,       /* 3x  0123456789:;<=>?	 */
michael@0:      1008,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,       /* 4x  @ABCDEFGHIJKLMNO  */
michael@0:      1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023, 896, 896, 896, 896,1023,       /* 5x  PQRSTUVWXYZ[\]^_	 */
michael@0:         0,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,       /* 6x  `abcdefghijklmno	 */
michael@0:      1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023, 896,1012, 896,1023,   0,       /* 7x  pqrstuvwxyz{|}~ DEL */
michael@0:         0    /* 0x80 (DEL, 0x7F, is the last entry of the 7x row) */
michael@0: };
michael@0: 
michael@0: #define NO_NEED_ESC(C) (EscapeChars[((unsigned int) (C))] & (flags))
michael@0: 
michael@0: //----------------------------------------------------------------------------------------
michael@0: 
michael@0: /* NS_EscapeURL escapes one part of a URL, appending its output to |result| */
michael@0: 
michael@0: /* use the following flags to specify which 
michael@0:    part of a URL you want to escape: 
michael@0: 
michael@0:    esc_Scheme        =     1
michael@0:    esc_Username      =     2
michael@0:    esc_Password      =     4
michael@0:    esc_Host          =     8
michael@0:    esc_Directory     =    16
michael@0:    esc_FileBaseName  =    32
michael@0:    esc_FileExtension =    64
michael@0:    esc_Param         =   128
michael@0:    esc_Query         =   256
michael@0:    esc_Ref           =   512
michael@0: */
michael@0: 
michael@0: /* by default this function will not escape parts of a string
michael@0:    that already look escaped, which means it already includes 
michael@0:    a valid hexcode. This is done to avoid multiple escapes of
michael@0:    a string. Use the following flags to force escaping of a 
michael@0:    string:
michael@0:  
michael@0:    esc_Forced        =  1024
michael@0: */
michael@0: 
michael@0: // Escapes the first |partLen| bytes of |part| (the whole NUL-terminated
michael@0: // string when |partLen| is negative) for the URL component(s) selected by
michael@0: // |flags|, appending the output to |result|.
michael@0: //
michael@0: // Nothing is appended until the first byte that needs escaping is met,
michael@0: // unless esc_AlwaysCopy is set.  The return value tells whether |result| was
michael@0: // written to; when it is false the input needed no escaping and |result| is
michael@0: // untouched.  A null |part| returns false.
michael@0: bool NS_EscapeURL(const char *part,
michael@0:                            int32_t partLen,
michael@0:                            uint32_t flags,
michael@0:                            nsACString &result)
michael@0: {
michael@0:     if (!part) {
michael@0:         NS_NOTREACHED("null pointer");
michael@0:         return false;
michael@0:     }
michael@0: 
michael@0:     if (partLen < 0)
michael@0:         partLen = strlen(part);
michael@0: 
michael@0:     static const char kHexDigits[] = "0123456789ABCDEF";
michael@0: 
michael@0:     const bool forced         = (flags & esc_Forced) != 0;
michael@0:     const bool ignoreNonAscii = (flags & esc_OnlyASCII) != 0;
michael@0:     const bool ignoreAscii    = (flags & esc_OnlyNonASCII) != 0;
michael@0:     const bool colon          = (flags & esc_Colon) != 0;
michael@0:     bool writing              = (flags & esc_AlwaysCopy) != 0;
michael@0: 
michael@0:     const unsigned char* const bytes =
michael@0:         reinterpret_cast<const unsigned char*>(part);
michael@0: 
michael@0:     // Output is staged here and flushed to |result| in chunks.
michael@0:     char chunk[100];
michael@0:     unsigned int chunkLen = 0;
michael@0: 
michael@0:     bool prevWasNonASCII = false;
michael@0:     for (int pos = 0; pos < partLen; ++pos)
michael@0:     {
michael@0:       const unsigned char c = bytes[pos];
michael@0: 
michael@0:       // Decide whether this byte must be escaped.
michael@0:       //
michael@0:       // Two rules override everything else:
michael@0:       //  - a colon is escaped on special request, even when the matrix
michael@0:       //    would allow it;
michael@0:       //  - '|' is escaped when it follows a non-ASCII byte, as it may be
michael@0:       //    part of a multi-byte character (unless non-ASCII is ignored).
michael@0:       //
michael@0:       // Otherwise the byte is copied as-is when the matrix allows it, when
michael@0:       // it is a '%' and escaping is not forced (see bugzilla bug 61269),
michael@0:       // when it is non-ASCII and non-ASCII is ignored, or when it is a
michael@0:       // printable ASCII character other than space (0x21..0x7e) and ASCII is
michael@0:       // ignored.  Control characters (C0 and DEL) and spaces never benefit
michael@0:       // from ignoreAscii.
michael@0:       bool mustEscape;
michael@0:       if (c == ':' && colon)
michael@0:       {
michael@0:         mustEscape = true;
michael@0:       }
michael@0:       else if (prevWasNonASCII && c == '|' && !ignoreNonAscii)
michael@0:       {
michael@0:         mustEscape = true;
michael@0:       }
michael@0:       else
michael@0:       {
michael@0:         mustEscape = !(NO_NEED_ESC(c)
michael@0:                        || (c == HEX_ESCAPE && !forced)
michael@0:                        || (c > 0x7f && ignoreNonAscii)
michael@0:                        || (c > 0x20 && c < 0x7f && ignoreAscii));
michael@0:       }
michael@0: 
michael@0:       if (mustEscape)
michael@0:       {
michael@0:         if (!writing)
michael@0:         {
michael@0:           // First escape: bring |result| up to date with the prefix that
michael@0:           // has been skipped so far.
michael@0:           result.Append(part, pos);
michael@0:           writing = true;
michael@0:         }
michael@0:         chunk[chunkLen++] = HEX_ESCAPE;
michael@0:         chunk[chunkLen++] = kHexDigits[c >> 4];   /* high nibble */
michael@0:         chunk[chunkLen++] = kHexDigits[c & 0x0f]; /* low nibble */
michael@0:       }
michael@0:       else if (writing)
michael@0:       {
michael@0:         chunk[chunkLen++] = c;
michael@0:       }
michael@0: 
michael@0:       // Flush while there is still room for one more escape plus the NUL.
michael@0:       if (chunkLen >= sizeof(chunk) - 4)
michael@0:       {
michael@0:         NS_ASSERTION(writing, "should be writing");
michael@0:         chunk[chunkLen] = '\0';
michael@0:         result += chunk;
michael@0:         chunkLen = 0;
michael@0:       }
michael@0: 
michael@0:       prevWasNonASCII = (c > 0x7f);
michael@0:     }
michael@0: 
michael@0:     if (writing) {
michael@0:       chunk[chunkLen] = '\0';
michael@0:       result += chunk;
michael@0:     }
michael@0:     return writing;
michael@0: }
michael@0: 
michael@0: // ISHEX(c) is non-null when |c| is a hex digit.  It relies on an array named
michael@0: // |hexChars| being in scope wherever it is expanded.
michael@0: #define ISHEX(c) memchr(hexChars, c, sizeof(hexChars)-1)
michael@0: 
michael@0: // Decodes the "%XX" escapes found in the first |len| bytes of |str| (the
michael@0: // whole NUL-terminated string when |len| is negative), appending the output
michael@0: // to |result|.
michael@0: //
michael@0: // esc_OnlyASCII leaves escapes of bytes >= 0x80 alone, esc_OnlyNonASCII
michael@0: // leaves escapes of bytes < 0x80 alone, and esc_SkipControl leaves escapes of
michael@0: // 0x00-0x1F and 0x7F alone.  Nothing is appended until the first escape is
michael@0: // decoded, unless esc_AlwaysCopy is set.  The return value tells whether
michael@0: // |result| was written to; a null |str| returns false.
michael@0: bool NS_UnescapeURL(const char *str, int32_t len, uint32_t flags, nsACString &result)
michael@0: {
michael@0:     if (!str) {
michael@0:         NS_NOTREACHED("null pointer");
michael@0:         return false;
michael@0:     }
michael@0: 
michael@0:     if (len < 0)
michael@0:         len = strlen(str);
michael@0: 
michael@0:     static const char hexChars[] = "0123456789ABCDEFabcdef";
michael@0: 
michael@0:     const bool leaveNonAsciiEscaped = (flags & esc_OnlyASCII) != 0;
michael@0:     const bool leaveAsciiEscaped    = (flags & esc_OnlyNonASCII) != 0;
michael@0:     const bool leaveControlEscaped  = (flags & esc_SkipControl) != 0;
michael@0:     bool writing                    = (flags & esc_AlwaysCopy) != 0;
michael@0: 
michael@0:     const char* const end = str + len;
michael@0:     const char* pending = str;   // start of the input not yet copied to |result|
michael@0: 
michael@0:     int32_t pos = 0;
michael@0:     while (pos < len) {
michael@0:         const char* const cur = str + pos;
michael@0: 
michael@0:         // Only a '%' with at least two bytes after it can start an escape.
michael@0:         if (*cur != HEX_ESCAPE || pos >= len - 2) {
michael@0:             ++pos;
michael@0:             continue;
michael@0:         }
michael@0: 
michael@0:         const unsigned char hi = static_cast<unsigned char>(cur[1]);
michael@0:         const unsigned char lo = static_cast<unsigned char>(cur[2]);
michael@0: 
michael@0:         bool decode = ISHEX(hi) && ISHEX(lo);
michael@0:         if (decode) {
michael@0:             // A high digit below '8' means the escaped byte is ASCII.
michael@0:             const bool isAscii = hi < '8';
michael@0:             decode = isAscii ? !leaveAsciiEscaped : !leaveNonAsciiEscaped;
michael@0:         }
michael@0:         if (decode && leaveControlEscaped) {
michael@0:             // %00-%1F and %7F stay escaped.
michael@0:             const bool isControl =
michael@0:                 hi < '2' || (hi == '7' && (lo == 'f' || lo == 'F'));
michael@0:             decode = !isControl;
michael@0:         }
michael@0:         if (!decode) {
michael@0:             ++pos;
michael@0:             continue;
michael@0:         }
michael@0: 
michael@0:         writing = true;
michael@0:         if (cur > pending)
michael@0:             result.Append(pending, cur - pending);
michael@0: 
michael@0:         const char decoded = (UNHEX(hi) << 4) + UNHEX(lo);
michael@0:         result.Append(decoded);
michael@0: 
michael@0:         // Step over the '%' and both digits.
michael@0:         pending = cur + 3;
michael@0:         pos += 3;
michael@0:     }
michael@0: 
michael@0:     if (writing && pending < end)
michael@0:         result.Append(pending, end - pending);
michael@0: 
michael@0:     return writing;
michael@0: }