Rebuild with utf-8 encoding

[privoxy.git] / urlmatch.c
diff --git a/urlmatch.c b/urlmatch.c

index b390e35..c731ec3 100644 (file)
--- a/urlmatch.c
+++ b/urlmatch.c
@@ -1,4 +1,4 @@
-const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.57 2009/06/03 16:44:15 fabiankeil Exp $";
+const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.74 2012/12/07 12:49:20 fabiankeil Exp $";
  /*********************************************************************
   *
   * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
@@ -6,7 +6,7 @@ const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.57 2009/06/03 16:44:15 fabianke
   * Purpose     :  Declares functions to match URLs against URL
   *                patterns.
   *
- * Copyright   :  Written by and Copyright (C) 2001-2009
+ * Copyright   :  Written by and Copyright (C) 2001-2011
   *                the Privoxy team. http://www.privoxy.org/
   *
   *                Based on the Internet Junkbuster originally written
@@ -58,7 +58,13 @@ const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.57 2009/06/03 16:44:15 fabianke
  
  const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  
-enum regex_anchoring {NO_ANCHORING, LEFT_ANCHORED, RIGHT_ANCHORED};
+enum regex_anchoring
+{
+   NO_ANCHORING,
+   LEFT_ANCHORED,       /* compile_pattern() uses the format "^%s" */
+   RIGHT_ANCHORED,      /* compile_pattern() uses the format "%s$" */
+   RIGHT_ANCHORED_HOST  /* format "%s\\.?$": also accepts one trailing dot */
+};
  static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern);
  
  /*********************************************************************
@@ -109,7 +115,6 @@ void free_http_request(struct http_request *http)
   *          1  :  http = pointer to the http structure to hold elements.
   *
   * Returns     :  JB_ERR_OK on success
- *                JB_ERR_MEMORY on out of memory
   *                JB_ERR_PARSE on malformed command/URL
   *                             or >100 domains deep.
   *
@@ -120,20 +125,16 @@ jb_err init_domain_components(struct http_request *http)
     size_t size;
     char *p;
  
-   http->dbuffer = strdup(http->host);
-   if (NULL == http->dbuffer)
-   {
-      return JB_ERR_MEMORY;
-   }
+   http->dbuffer = strdup_or_die(http->host);
  
     /* map to lower case */
     for (p = http->dbuffer; *p ; p++)
     {
-      *p = (char)tolower((int)(unsigned char)*p);
+      *p = (char)privoxy_tolower(*p);
     }
  
     /* split the domain name into components */
-   http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
+   http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
  
     if (http->dcount <= 0)
     {
@@ -148,11 +149,7 @@ jb_err init_domain_components(struct http_request *http)
     /* save a copy of the pointers in dvec */
     size = (size_t)http->dcount * sizeof(*http->dvec);
  
-   http->dvec = (char **)malloc(size);
-   if (NULL == http->dvec)
-   {
-      return JB_ERR_MEMORY;
-   }
+   http->dvec = malloc_or_die(size);
  
     memcpy(http->dvec, vec, size);
  
@@ -161,6 +158,53 @@ jb_err init_domain_components(struct http_request *http)
  #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
  
  
+/*********************************************************************
+ *
+ * Function    :  url_requires_percent_encoding
+ *
+ * Description :  Checks if a URL contains characters that are invalid
+ *                according to RFC 3986 and should be percent-encoded.
+ *                Does not verify whether or not the passed string
+ *                actually is a valid URL.
+ *
+ * Parameters  :
+ *          1  :  url = URL to check
+ *
+ * Returns     :  TRUE if any character needs percent-encoding, else FALSE
+ *
+ *********************************************************************/
+int url_requires_percent_encoding(const char *url)
+{
+   static const char allowed_characters[128] = {
+      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+      '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
+      '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
+      '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
+      '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
+      'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
+      'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
+      'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
+      'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
+      'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
+      'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
+   };
+   /* Table is indexed by byte value; a '\0' entry marks a disallowed byte. */
+   while (*url != '\0')
+   {
+      const unsigned int i = (unsigned char)*url++;
+      if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
+      {
+         return TRUE;
+      }
+   }
+
+   return FALSE;
+
+}
+
+
  /*********************************************************************
   *
   * Function    :  parse_http_url
@@ -176,7 +220,6 @@ jb_err init_domain_components(struct http_request *http)
   *                                   protocol are acceptable.
   *
   * Returns     :  JB_ERR_OK on success
- *                JB_ERR_MEMORY on out of memory
   *                JB_ERR_PARSE on malformed command/URL
   *                             or >100 domains deep.
   *
@@ -188,23 +231,15 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
     /*
      * Save our initial URL
      */
-   http->url = strdup(url);
-   if (http->url == NULL)
-   {
-      return JB_ERR_MEMORY;
-   }
-
+   http->url = strdup_or_die(url);
  
     /*
      * Check for * URI. If found, we're done.
-    */  
+    */
     if (*http->url == '*')
     {
-      if  ( NULL == (http->path = strdup("*"))
-         || NULL == (http->hostport = strdup("")) ) 
-      {
-         return JB_ERR_MEMORY;
-      }
+      http->path = strdup_or_die("*");
+      http->hostport = strdup_or_die("");
        if (http->url[1] != '\0')
        {
           return JB_ERR_PARSE;
@@ -221,11 +256,7 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
        char *url_noproto;
        char *url_path;
  
-      buf = strdup(url);
-      if (buf == NULL)
-      {
-         return JB_ERR_MEMORY;
-      }
+      buf = strdup_or_die(url);
  
        /* Find the start of the URL in our scratch space */
        url_noproto = buf;
@@ -268,9 +299,9 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
            * https URL in and it's parsed by the function.  (When the
            * URL is actually retrieved, SSL hides the path part).
            */
-         http->path = strdup(http->ssl ? "/" : url_path);
+         http->path = strdup_or_die(http->ssl ? "/" : url_path);
           *url_path = '\0';
-         http->hostport = strdup(url_noproto);
+         http->hostport = strdup_or_die(url_noproto);
        }
        else
        {
@@ -278,17 +309,11 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
            * Repair broken HTTP requests that don't contain a path,
            * or CONNECT requests
            */
-         http->path = strdup("/");
-         http->hostport = strdup(url_noproto);
+         http->path = strdup_or_die("/");
+         http->hostport = strdup_or_die(url_noproto);
        }
  
        freez(buf);
-
-      if ( (http->path == NULL)
-        || (http->hostport == NULL))
-      {
-         return JB_ERR_MEMORY;
-      }
     }
  
     if (!host_available)
@@ -305,11 +330,7 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
        char *host;
        char *port;
  
-      buf = strdup(http->hostport);
-      if (buf == NULL)
-      {
-         return JB_ERR_MEMORY;
-      }
+      buf = strdup_or_die(http->hostport);
  
        /* check if url contains username and/or password */
        host = strchr(buf, '@');
@@ -361,9 +382,18 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
        if (port != NULL)
        {
           /* Contains port */
+         char *endptr;
+         long parsed_port;
           /* Terminate hostname and point to start of port string */
           *port++ = '\0';
-         http->port = atoi(port);
+         parsed_port = strtol(port, &endptr, 10);
+         if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
+         {
+            log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
+            freez(buf);
+            return JB_ERR_PARSE;
+         }
+         http->port = (int)parsed_port;
        }
        else
        {
@@ -371,14 +401,9 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
           http->port = (http->ssl ? 443 : 80);
        }
  
-      http->host = strdup(host);
+      http->host = strdup_or_die(host);
  
        freez(buf);
-
-      if (http->host == NULL)
-      {
-         return JB_ERR_MEMORY;
-      }
     }
  
  #ifdef FEATURE_EXTENDED_HOST_PATTERNS
@@ -405,7 +430,7 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
   *********************************************************************/
  static int unknown_method(const char *method)
  {
-   static const char *known_http_methods[] = {
+   static const char * const known_http_methods[] = {
        /* Basic HTTP request type */
        "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
        /* webDAV extensions (RFC2518) */
@@ -414,14 +439,14 @@ static int unknown_method(const char *method)
         * Microsoft webDAV extension for Exchange 2000.  See:
         * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
         * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
-       */ 
+       */
        "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
        /*
         * Another Microsoft webDAV extension for Exchange 2000.  See:
         * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
         * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
         * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
-       */ 
+       */
        "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
        /*
         * Yet another WebDAV extension, this time for
@@ -457,7 +482,6 @@ static int unknown_method(const char *method)
   *          2  :  http = pointer to the http structure to hold elements
   *
   * Returns     :  JB_ERR_OK on success
- *                JB_ERR_MEMORY on out of memory
   *                JB_ERR_CGI_PARAMS on malformed command/URL
   *                                  or >100 domains deep.
   *
@@ -471,13 +495,9 @@ jb_err parse_http_request(const char *req, struct http_request *http)
  
     memset(http, '\0', sizeof(*http));
  
-   buf = strdup(req);
-   if (buf == NULL)
-   {
-      return JB_ERR_MEMORY;
-   }
+   buf = strdup_or_die(req);
  
-   n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
+   n = ssplit(buf, " \r\n", v, SZ(v));
     if (n != 3)
     {
        freez(buf);
@@ -520,19 +540,12 @@ jb_err parse_http_request(const char *req, struct http_request *http)
     /*
      * Copy the details into the structure
      */
-   http->cmd = strdup(req);
-   http->gpc = strdup(v[0]);
-   http->ver = strdup(v[2]);
+   http->cmd = strdup_or_die(req);
+   http->gpc = strdup_or_die(v[0]);
+   http->ver = strdup_or_die(v[2]);
  
     freez(buf);
  
-   if ( (http->cmd == NULL)
-     || (http->gpc == NULL)
-     || (http->ver == NULL) )
-   {
-      return JB_ERR_MEMORY;
-   }
-
     return JB_ERR_OK;
  
  }
@@ -546,9 +559,10 @@ jb_err parse_http_request(const char *req, struct http_request *http)
   *
   * Parameters  :
   *          1  :  pattern = The pattern to compile.
- *          2  :  anchoring = How the regex should be anchored.
- *                            Can be either one of NO_ANCHORING,
- *                            LEFT_ANCHORED or RIGHT_ANCHORED.
+ *          2  :  anchoring = How the regex should be modified
+ *                            before compilation. Can be one of
+ *                            NO_ANCHORING, LEFT_ANCHORED,
+ *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
   *          3  :  url     = In case of failures, the spec member is
   *                          logged and the structure freed.
   *          4  :  regex   = Where the compiled regex should be stored.
@@ -582,6 +596,9 @@ static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchorin
        case RIGHT_ANCHORED:
           fmt = "%s$";
           break;
+      case RIGHT_ANCHORED_HOST:
+         fmt = "%s\\.?$";
+         break;
        case LEFT_ANCHORED:
           fmt = "^%s";
           break;
@@ -692,11 +709,7 @@ static jb_err compile_url_pattern(struct url_spec *url, char *buf)
     if (NULL != p)
     {
        *p++ = '\0';
-      url->port_list = strdup(p);
-      if (NULL == url->port_list)
-      {
-         return JB_ERR_MEMORY;
-      }
+      url->port_list = strdup_or_die(p);
     }
     else
     {
@@ -718,7 +731,7 @@ static jb_err compile_url_pattern(struct url_spec *url, char *buf)
   *
   * Function    :  compile_host_pattern
   *
- * Description :  Parses and compiles a host pattern..
+ * Description :  Parses and compiles a host pattern.
   *
   * Parameters  :
   *          1  :  url = Target url_spec to be filled in.
@@ -731,7 +744,7 @@ static jb_err compile_url_pattern(struct url_spec *url, char *buf)
   *********************************************************************/
  static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
  {
-   return compile_pattern(host_pattern, RIGHT_ANCHORED, url, &url->host_regex);
+   return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->host_regex);
  }
  
  #else
@@ -747,7 +760,6 @@ static jb_err compile_host_pattern(struct url_spec *url, const char *host_patter
   *          2  :  host_pattern = Host pattern to parse.
   *
   * Returns     :  JB_ERR_OK - Success
- *                JB_ERR_MEMORY - Out of memory
   *                JB_ERR_PARSE - Cannot parse regex
   *
   *********************************************************************/
@@ -769,47 +781,37 @@ static jb_err compile_host_pattern(struct url_spec *url, const char *host_patter
        url->unanchored |= ANCHOR_LEFT;
     }
  
-   /* 
+   /*
      * Split domain into components
      */
-   url->dbuffer = strdup(host_pattern);
-   if (NULL == url->dbuffer)
-   {
-      free_url_spec(url);
-      return JB_ERR_MEMORY;
-   }
+   url->dbuffer = strdup_or_die(host_pattern);
  
-   /* 
+   /*
      * Map to lower case
      */
     for (p = url->dbuffer; *p ; p++)
     {
-      *p = (char)tolower((int)(unsigned char)*p);
+      *p = (char)privoxy_tolower(*p);
     }
  
-   /* 
+   /*
      * Split the domain name into components
      */
-   url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
+   url->dcount = ssplit(url->dbuffer, ".", v, SZ(v));
  
     if (url->dcount < 0)
     {
        free_url_spec(url);
-      return JB_ERR_MEMORY;
+      return JB_ERR_PARSE;
     }
     else if (url->dcount != 0)
     {
-      /* 
+      /*
         * Save a copy of the pointers in dvec
         */
        size = (size_t)url->dcount * sizeof(*url->dvec);
-      
-      url->dvec = (char **)malloc(size);
-      if (NULL == url->dvec)
-      {
-         free_url_spec(url);
-         return JB_ERR_MEMORY;
-      }
+
+      url->dvec = malloc_or_die(size);
  
        memcpy(url->dvec, v, size);
     }
@@ -842,13 +844,13 @@ static int simplematch(const char *pattern, const char *text)
  {
     const unsigned char *pat = (const unsigned char *)pattern;
     const unsigned char *txt = (const unsigned char *)text;
-   const unsigned char *fallback = pat; 
+   const unsigned char *fallback = pat;
     int wildcard = 0;
-  
+
     unsigned char lastchar = 'a';
     unsigned i;
     unsigned char charmap[32];
-  
+
     while (*txt)
     {
  
@@ -866,15 +868,15 @@ static int simplematch(const char *pattern, const char *text)
        }
  
        /* '*' in the pattern?  */
-      if (*pat == '*') 
+      if (*pat == '*')
        {
-     
+
           /* The pattern ends afterwards? Speed up the return. */
           if (*++pat == '\0')
           {
              return 0;
           }
-     
+
           /* Else, set wildcard mode and remember position after '*' */
           wildcard = 1;
           fallback = pat;
@@ -888,7 +890,7 @@ static int simplematch(const char *pattern, const char *text)
           while (*++pat != ']')
           {
              if (!*pat)
-            { 
+            {
                 return 1;
              }
              else if (*pat == '-')
@@ -900,7 +902,7 @@ static int simplematch(const char *pattern, const char *text)
                 for (i = lastchar; i <= *pat; i++)
                 {
                    charmap[i / 8] |= (unsigned char)(1 << (i % 8));
-               } 
+               }
              }
              else
              {
@@ -911,21 +913,21 @@ static int simplematch(const char *pattern, const char *text)
        } /* -END- if Character range specification */
  
  
-      /* 
-       * Char match, or char range match? 
+      /*
+       * Char match, or char range match?
         */
-      if ( (*pat == *txt)
-      ||   (*pat == '?')
-      ||   ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))) )
+      if ((*pat == *txt)
+       || (*pat == '?')
+       || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
        {
-         /* 
-          * Sucess: Go ahead
+         /*
+          * Success: Go ahead
            */
           pat++;
        }
        else if (!wildcard)
        {
-         /* 
+         /*
            * No match && no wildcard: No luck
            */
           return 1;
@@ -952,7 +954,7 @@ static int simplematch(const char *pattern, const char *text)
     }
  
     /* Cut off extra '*'s */
-   if(*pat == '*')  pat++;
+   if (*pat == '*') pat++;
  
     /* If this is the pattern's end, fine! */
     return(*pat);
@@ -1098,7 +1100,6 @@ static int domain_match(const struct url_spec *pattern, const struct http_reques
   *                      are lost forever.
   *
   * Returns     :  JB_ERR_OK - Success
- *                JB_ERR_MEMORY - Out of memory
   *                JB_ERR_PARSE - Cannot parse regex (Detailed message
   *                               written to system log)
   *
@@ -1111,14 +1112,10 @@ jb_err create_url_spec(struct url_spec *url, char *buf)
     memset(url, '\0', sizeof(*url));
  
     /* Remember the original specification for the CGI pages. */
-   url->spec = strdup(buf);
-   if (NULL == url->spec)
-   {
-      return JB_ERR_MEMORY;
-   }
+   url->spec = strdup_or_die(buf);
  
     /* Is it a tag pattern? */
-   if (0 == strncmpic("TAG:", url->spec, 4))
+   if (0 == strncmpic(url->spec, "TAG:", 4))
     {
        /* The pattern starts with the first character after "TAG:" */
        const char *tag_pattern = buf + 4;
@@ -1257,7 +1254,7 @@ int url_match(const struct url_spec *pattern,
     {
        /* It's a tag pattern and shouldn't be matched against URLs */
        return 0;
-   } 
+   }
  
     return (port_matches(http->port, pattern->port_list)
        && host_matches(http, pattern) && path_matches(http->path, pattern));
@@ -1284,7 +1281,7 @@ int match_portlist(const char *portlist, int port)
  {
     char *min, *max, *next, *portlist_copy;
  
-   min = portlist_copy = strdup(portlist);
+   min = portlist_copy = strdup_or_die(portlist);
  
     /*
      * Zero-terminate first item and remember offset for next
@@ -1317,7 +1314,7 @@ int match_portlist(const char *portlist, int port)
            * or, if max was omitted, between min and 65K
            */
           *max++ = '\0';
-         if(port >= atoi(min) && port <= (atoi(max) ? atoi(max) : 65535))
+         if (port >= atoi(min) && port <= (atoi(max) ? atoi(max) : 65535))
           {
              freez(portlist_copy);
              return(1);
@@ -1372,11 +1369,7 @@ jb_err parse_forwarder_address(char *address, char **hostname, int *port)
        return JB_ERR_PARSE;
     }
  
-   *hostname = strdup(address);
-   if (NULL == *hostname)
-   {
-      return JB_ERR_MEMORY;
-   }
+   *hostname = strdup_or_die(address);
  
     if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
     {