X-Git-Url: http://www.privoxy.org/gitweb/?a=blobdiff_plain;f=urlmatch.c;h=3a9c865784a75f3eed315be14e35dcbd887a6237;hb=430b87b899c9294875101eee7faca82880a111e1;hp=d888e37dbf0ed86e3889218f647488e80e703fb2;hpb=e3c7acd6fdb41505246948ee9cb09f9b4915e027;p=privoxy.git

diff --git a/urlmatch.c b/urlmatch.c
index d888e37d..3a9c8657 100644
--- a/urlmatch.c
+++ b/urlmatch.c
@@ -1,4 +1,4 @@
-const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.51 2009/05/16 13:27:20 fabiankeil Exp $";
+const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.69 2012/03/09 16:24:36 fabiankeil Exp $";
 /*********************************************************************
  *
  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
@@ -6,7 +6,7 @@ const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.51 2009/05/16 13:27:20 fabianke
  * Purpose     :  Declares functions to match URLs against URL
  *                patterns.
  *
- * Copyright   :  Written by and Copyright (C) 2001-2009
+ * Copyright   :  Written by and Copyright (C) 2001-2011
  *                the Privoxy team. http://www.privoxy.org/
  *
  *                Based on the Internet Junkbuster originally written
@@ -58,7 +58,13 @@ const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.51 2009/05/16 13:27:20 fabianke
 
 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
 
-enum regex_anchoring {NO_ANCHORING, LEFT_ANCHORED, RIGHT_ANCHORED};
+enum regex_anchoring
+{
+   NO_ANCHORING,
+   LEFT_ANCHORED,
+   RIGHT_ANCHORED,
+   RIGHT_ANCHORED_HOST
+};
 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern);
 
 /*********************************************************************
@@ -86,12 +92,15 @@ void free_http_request(struct http_request *http)
    freez(http->path);
    freez(http->ver);
    freez(http->host_ip_addr_str);
+#ifndef FEATURE_EXTENDED_HOST_PATTERNS
    freez(http->dbuffer);
    freez(http->dvec);
    http->dcount = 0;
+#endif
 }
 
 
+#ifndef FEATURE_EXTENDED_HOST_PATTERNS
 /*********************************************************************
  *
  * Function    :  init_domain_components
@@ -126,7 +135,7 @@ jb_err init_domain_components(struct http_request *http)
    /* map to lower case */
    for (p = http->dbuffer; *p ; p++)
    {
-      *p = (char)tolower((int)(unsigned char)*p);
+      *p = (char)privoxy_tolower(*p);
    }
 
    /* split the domain name into components */
@@ -155,6 +164,54 @@ jb_err init_domain_components(struct http_request *http)
 
    return JB_ERR_OK;
 }
+#endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
+
+
+/*********************************************************************
+ *
+ * Function    :  url_requires_percent_encoding
+ *
+ * Description :  Checks if an URL contains invalid characters
+ *                according to RFC 3986 that should be percent-encoded.
+ *                Does not verify whether or not the passed string
+ *                actually is a valid URL.
+ *
+ * Parameters  :
+ *          1  :  url = NUL-terminated URL string to check
+ *
+ * Returns     :  TRUE if a character requires encoding, FALSE otherwise
+ *
+ *********************************************************************/
+int url_requires_percent_encoding(const char *url)
+{
+   static const char allowed_characters[128] = {
+      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+      '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
+      '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
+      '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
+      '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
+      'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
+      'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
+      'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
+      'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
+      'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
+      'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
+   };
+   /* Table is indexed by byte value; zero entries and bytes >= 128 need encoding. */
+   while (*url != '\0')
+   {
+      const unsigned int i = (unsigned char)*url++;
+      if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
+      {
+         return TRUE;
+      }
+   }
+
+   return FALSE;
+
+}
 
 
 /*********************************************************************
@@ -193,11 +250,11 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
 
    /*
     * Check for * URI. If found, we're done.
-    */  
+    */
    if (*http->url == '*')
    {
-      if  ( NULL == (http->path = strdup("*"))
-         || NULL == (http->hostport = strdup("")) ) 
+      if (NULL == (http->path = strdup("*"))
+       || NULL == (http->hostport = strdup("")))
       {
          return JB_ERR_MEMORY;
       }
@@ -280,8 +337,8 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
 
       freez(buf);
 
-      if ( (http->path == NULL)
-        || (http->hostport == NULL))
+      if ((http->path == NULL)
+       || (http->hostport == NULL))
       {
          return JB_ERR_MEMORY;
       }
@@ -377,10 +434,12 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
       }
    }
 
-   /*
-    * Split domain name so we can compare it against wildcards
-    */
+#ifdef FEATURE_EXTENDED_HOST_PATTERNS
+   return JB_ERR_OK;
+#else
+   /* Split domain name so we can compare it against wildcards */
    return init_domain_components(http);
+#endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 
 }
 
@@ -399,7 +458,7 @@ jb_err parse_http_url(const char *url, struct http_request *http, int require_pr
  *********************************************************************/
 static int unknown_method(const char *method)
 {
-   static const char *known_http_methods[] = {
+   static const char * const known_http_methods[] = {
       /* Basic HTTP request type */
       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
       /* webDAV extensions (RFC2518) */
@@ -408,14 +467,14 @@ static int unknown_method(const char *method)
        * Microsoft webDAV extension for Exchange 2000.  See:
        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
-       */ 
+       */
       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
       /*
        * Another Microsoft webDAV extension for Exchange 2000.  See:
        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
-       */ 
+       */
       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
       /*
        * Yet another WebDAV extension, this time for
@@ -522,7 +581,7 @@ jb_err parse_http_request(const char *req, struct http_request *http)
 
    if ( (http->cmd == NULL)
      || (http->gpc == NULL)
-     || (http->ver == NULL) )
+     || (http->ver == NULL))
    {
       return JB_ERR_MEMORY;
    }
@@ -540,9 +599,10 @@ jb_err parse_http_request(const char *req, struct http_request *http)
  *
  * Parameters  :
  *          1  :  pattern = The pattern to compile.
- *          2  :  anchoring = How the regex should be anchored.
- *                            Can be either one of NO_ANCHORING,
- *                            LEFT_ANCHORED or RIGHT_ANCHORED.
+ *          2  :  anchoring = How the regex should be modified
+ *                            before compilation. Can be either
+ *                            one of NO_ANCHORING, LEFT_ANCHORED,
+ *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
  *          3  :  url     = In case of failures, the spec member is
  *                          logged and the structure freed.
  *          4  :  regex   = Where the compiled regex should be stored.
@@ -576,6 +636,9 @@ static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchorin
       case RIGHT_ANCHORED:
          fmt = "%s$";
          break;
+      case RIGHT_ANCHORED_HOST:
+         fmt = "%s\\.?$";
+         break;
       case LEFT_ANCHORED:
          fmt = "^%s";
          break;
@@ -712,7 +775,7 @@ static jb_err compile_url_pattern(struct url_spec *url, char *buf)
  *
  * Function    :  compile_host_pattern
  *
- * Description :  Parses and compiles a host pattern..
+ * Description :  Parses and compiles a host pattern.
  *
  * Parameters  :
  *          1  :  url = Target url_spec to be filled in.
@@ -725,7 +788,7 @@ static jb_err compile_url_pattern(struct url_spec *url, char *buf)
  *********************************************************************/
 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 {
-   return compile_pattern(host_pattern, RIGHT_ANCHORED, url, &url->host_regex);
+   return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->host_regex);
 }
 
 #else
@@ -763,7 +826,7 @@ static jb_err compile_host_pattern(struct url_spec *url, const char *host_patter
       url->unanchored |= ANCHOR_LEFT;
    }
 
-   /* 
+   /*
     * Split domain into components
     */
    url->dbuffer = strdup(host_pattern);
@@ -773,15 +836,15 @@ static jb_err compile_host_pattern(struct url_spec *url, const char *host_patter
       return JB_ERR_MEMORY;
    }
 
-   /* 
+   /*
     * Map to lower case
     */
    for (p = url->dbuffer; *p ; p++)
    {
-      *p = (char)tolower((int)(unsigned char)*p);
+      *p = (char)privoxy_tolower(*p);
    }
 
-   /* 
+   /*
     * Split the domain name into components
     */
    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
@@ -793,11 +856,11 @@ static jb_err compile_host_pattern(struct url_spec *url, const char *host_patter
    }
    else if (url->dcount != 0)
    {
-      /* 
+      /*
        * Save a copy of the pointers in dvec
        */
       size = (size_t)url->dcount * sizeof(*url->dvec);
-      
+
       url->dvec = (char **)malloc(size);
       if (NULL == url->dvec)
       {
@@ -836,13 +899,13 @@ static int simplematch(const char *pattern, const char *text)
 {
    const unsigned char *pat = (const unsigned char *)pattern;
    const unsigned char *txt = (const unsigned char *)text;
-   const unsigned char *fallback = pat; 
+   const unsigned char *fallback = pat;
    int wildcard = 0;
-  
+
    unsigned char lastchar = 'a';
    unsigned i;
    unsigned char charmap[32];
-  
+
    while (*txt)
    {
 
@@ -860,15 +923,15 @@ static int simplematch(const char *pattern, const char *text)
       }
 
       /* '*' in the pattern?  */
-      if (*pat == '*') 
+      if (*pat == '*')
       {
-     
+
          /* The pattern ends afterwards? Speed up the return. */
          if (*++pat == '\0')
          {
             return 0;
          }
-     
+
          /* Else, set wildcard mode and remember position after '*' */
          wildcard = 1;
          fallback = pat;
@@ -882,7 +945,7 @@ static int simplematch(const char *pattern, const char *text)
          while (*++pat != ']')
          {
             if (!*pat)
-            { 
+            {
                return 1;
             }
             else if (*pat == '-')
@@ -894,7 +957,7 @@ static int simplematch(const char *pattern, const char *text)
                for (i = lastchar; i <= *pat; i++)
                {
                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
-               } 
+               }
             }
             else
             {
@@ -905,21 +968,21 @@ static int simplematch(const char *pattern, const char *text)
       } /* -END- if Character range specification */
 
 
-      /* 
-       * Char match, or char range match? 
+      /*
+       * Char match, or char range match?
        */
-      if ( (*pat == *txt)
-      ||   (*pat == '?')
-      ||   ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))) )
+      if ((*pat == *txt)
+       || (*pat == '?')
+       || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
       {
-         /* 
-          * Sucess: Go ahead
+         /*
+          * Success: Go ahead
           */
          pat++;
       }
       else if (!wildcard)
       {
-         /* 
+         /*
           * No match && no wildcard: No luck
           */
          return 1;
@@ -946,7 +1009,7 @@ static int simplematch(const char *pattern, const char *text)
    }
 
    /* Cut off extra '*'s */
-   if(*pat == '*')  pat++;
+   if (*pat == '*') pat++;
 
    /* If this is the pattern's end, fine! */
    return(*pat);
@@ -1111,15 +1174,15 @@ jb_err create_url_spec(struct url_spec *url, char *buf)
       return JB_ERR_MEMORY;
    }
 
-   /* Is it tag pattern? */
-   if (0 == strncmpic("TAG:", url->spec, 4))
+   /* Is it a tag pattern? */
+   if (0 == strncmpic(url->spec, "TAG:", 4))
    {
       /* The pattern starts with the first character after "TAG:" */
       const char *tag_pattern = buf + 4;
       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
    }
 
-   /* If it isn't a tag pattern it must be a URL pattern. */
+   /* If it isn't a tag pattern it must be an URL pattern. */
    return compile_url_pattern(url, buf);
 }
 
@@ -1167,6 +1230,70 @@ void free_url_spec(struct url_spec *url)
 }
 
 
+/*********************************************************************
+ *
+ * Function    :  port_matches
+ *
+ * Description :  Compares a port against a port list.
+ *
+ * Parameters  :
+ *          1  :  port      = The port to check.
+ *          2  :  port_list = The list of ports to compare with, or NULL.
+ *
+ * Returns     :  TRUE if port_list is NULL or matches port, FALSE otherwise.
+ *
+ *********************************************************************/
+static int port_matches(const int port, const char *port_list)
+{
+   return ((NULL == port_list) || match_portlist(port_list, port));
+}
+
+
+/*********************************************************************
+ *
+ * Function    :  host_matches
+ *
+ * Description :  Compares a host against a host pattern.
+ *
+ * Parameters  :
+ *          1  :  http = The request whose host is matched
+ *          2  :  pattern = The URL pattern
+ *
+ * Returns     :  TRUE if no host pattern is set or it matches, FALSE otherwise.
+ *
+ *********************************************************************/
+static int host_matches(const struct http_request *http,
+                        const struct url_spec *pattern)
+{
+#ifdef FEATURE_EXTENDED_HOST_PATTERNS
+   return ((NULL == pattern->host_regex)
+      || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)));
+#else
+   return ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)));
+#endif
+}
+
+
+/*********************************************************************
+ *
+ * Function    :  path_matches
+ *
+ * Description :  Compares a path against a path pattern.
+ *
+ * Parameters  :
+ *          1  :  path = The path to match
+ *          2  :  pattern = The URL pattern
+ *
+ * Returns     :  TRUE if no path regex is set or it matches, FALSE otherwise.
+ *
+ *********************************************************************/
+static int path_matches(const char *path, const struct url_spec *pattern)
+{
+   return ((NULL == pattern->preg)
+      || (0 == regexec(pattern->preg, path, 0, NULL, 0)));
+}
+
+
 /*********************************************************************
  *
  * Function    :  url_match
@@ -1183,22 +1310,14 @@ void free_url_spec(struct url_spec *url)
 int url_match(const struct url_spec *pattern,
               const struct http_request *http)
 {
-   /* XXX: these should probably be functions. */
-#define PORT_MATCHES ((NULL == pattern->port_list) || match_portlist(pattern->port_list, http->port))
-#ifdef FEATURE_EXTENDED_HOST_PATTERNS
-#define DOMAIN_MATCHES ((NULL == pattern->host_regex) || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)))
-#else
-#define DOMAIN_MATCHES ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)))
-#endif
-#define PATH_MATCHES ((NULL == pattern->preg) || (0 == regexec(pattern->preg, http->path, 0, NULL, 0)))
-
    if (pattern->tag_regex != NULL)
    {
       /* It's a tag pattern and shouldn't be matched against URLs */
       return 0;
-   } 
+   }
 
-   return (PORT_MATCHES && DOMAIN_MATCHES && PATH_MATCHES);
+   return (port_matches(http->port, pattern->port_list)
+      && host_matches(http, pattern) && path_matches(http->path, pattern));
 
 }
 
@@ -1235,7 +1354,7 @@ int match_portlist(const char *portlist, int port)
    /*
     * Loop through all items, checking for match
     */
-   while(min)
+   while (NULL != min)
    {
       if (NULL == (max = strchr(min, (int) '-')))
       {
@@ -1255,7 +1374,7 @@ int match_portlist(const char *portlist, int port)
           * or, if max was omitted, between min and 65K
           */
          *max++ = '\0';
-         if(port >= atoi(min) && port <= (atoi(max) ? atoi(max) : 65535))
+         if (port >= atoi(min) && port <= (atoi(max) ? atoi(max) : 65535))
          {
             freez(portlist_copy);
             return(1);