-const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.57 2009/06/03 16:44:15 fabiankeil Exp $";
+const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.64 2011/11/06 11:41:05 fabiankeil Exp $";
/*********************************************************************
*
* File : $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
* Purpose : Declares functions to match URLs against URL
* patterns.
*
- * Copyright : Written by and Copyright (C) 2001-2009
+ * Copyright : Written by and Copyright (C) 2001-2011
* the Privoxy team. http://www.privoxy.org/
*
* Based on the Internet Junkbuster originally written
const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
-enum regex_anchoring {NO_ANCHORING, LEFT_ANCHORED, RIGHT_ANCHORED};
+enum regex_anchoring /* Selects how a pattern is modified before it is compiled as a regex. */
+{
+ NO_ANCHORING, /* presumably compiled unmodified; the matching case is not visible in this excerpt */
+ LEFT_ANCHORED, /* "^" is prepended (format "^%s") */
+ RIGHT_ANCHORED, /* "$" is appended (format "%s$") */
+ RIGHT_ANCHORED_HOST /* like RIGHT_ANCHORED, but an optional dot is accepted before the end (regex suffix \.?$) */
+};
static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern);
/*********************************************************************
#endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
+/*********************************************************************
+ *
+ * Function : url_requires_percent_encoding
+ *
+ * Description : Checks if a URL contains characters that are
+ * invalid according to RFC 3986 and thus should be
+ * percent-encoded. Bytes outside the 7-bit ASCII
+ * range are treated as requiring encoding as well.
+ * The '%' character itself is accepted, so existing
+ * percent-escapes are not checked for validity.
+ * Does not verify whether or not the passed string
+ * actually is a valid URL.
+ *
+ * Parameters :
+ * 1 : url = URL to check (NUL-terminated string)
+ *
+ * Returns : TRUE if at least one character of the URL has to
+ * be percent-encoded, FALSE otherwise (including
+ * for the empty string).
+ *
+ *********************************************************************/
+int url_requires_percent_encoding(const char *url)
+{
+ static const char allowed_characters[128] = { /* Indexed by character code; a '\0' entry marks a character that must be percent-encoded. */
+ '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+ '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+ '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+ '\0', '\0', '\0', '!', '\0', '#', '$', '%', '&', '\'',
+ '(', ')', '*', '+', ',', '-', '.', '/', '0', '1',
+ '2', '3', '4', '5', '6', '7', '8', '9', ':', ';',
+ '\0', '=', '\0', '?', '@', 'A', 'B', 'C', 'D', 'E',
+ 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+ 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
+ 'Z', '[', '\0', ']', '\0', '_', '\0', 'a', 'b', 'c',
+ 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
+ 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
+ 'x', 'y', 'z', '\0', '\0', '\0', '~', '\0'
+ };
+
+ while (*url != '\0')
+ {
+ const unsigned int i = (unsigned char)*url++; /* Go through unsigned char so the index is never negative, whatever the signedness of char. */
+ if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i]) /* Byte beyond the table (>= 128), or a character that is not in the allowed set. */
+ {
+ return TRUE;
+ }
+ }
+
+ return FALSE; /* Every character was found in the allowed set. */
+
+}
+
+
/*********************************************************************
*
* Function : parse_http_url
/*
* Check for * URI. If found, we're done.
- */
+ */
if (*http->url == '*')
{
if ( NULL == (http->path = strdup("*"))
- || NULL == (http->hostport = strdup("")) )
+ || NULL == (http->hostport = strdup("")) )
{
return JB_ERR_MEMORY;
}
*********************************************************************/
static int unknown_method(const char *method)
{
- static const char *known_http_methods[] = {
+ static const char * const known_http_methods[] = {
/* Basic HTTP request type */
"GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
/* webDAV extensions (RFC2518) */
* Microsoft webDAV extension for Exchange 2000. See:
* http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
* http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
- */
+ */
"BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
/*
* Another Microsoft webDAV extension for Exchange 2000. See:
* http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
* http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
* http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
- */
+ */
"SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
/*
* Yet another WebDAV extension, this time for
*
* Parameters :
* 1 : pattern = The pattern to compile.
- * 2 : anchoring = How the regex should be anchored.
- * Can be either one of NO_ANCHORING,
- * LEFT_ANCHORED or RIGHT_ANCHORED.
+ * 2 : anchoring = How the regex should be modified
+ * before compilation. Can be either
+ * one of NO_ANCHORING, LEFT_ANCHORED,
+ * RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
* 3 : url = In case of failures, the spec member is
* logged and the structure freed.
* 4 : regex = Where the compiled regex should be stored.
case RIGHT_ANCHORED:
fmt = "%s$";
break;
+ case RIGHT_ANCHORED_HOST:
+ fmt = "%s\\.?$";
+ break;
case LEFT_ANCHORED:
fmt = "^%s";
break;
*********************************************************************/
static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
{
- return compile_pattern(host_pattern, RIGHT_ANCHORED, url, &url->host_regex);
+ return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->host_regex); /* RIGHT_ANCHORED_HOST makes the regex end in \.?$ so a host name written with a trailing dot still matches. */
}
#else
url->unanchored |= ANCHOR_LEFT;
}
- /*
+ /*
* Split domain into components
*/
url->dbuffer = strdup(host_pattern);
return JB_ERR_MEMORY;
}
- /*
+ /*
* Map to lower case
*/
for (p = url->dbuffer; *p ; p++)
*p = (char)tolower((int)(unsigned char)*p);
}
- /*
+ /*
* Split the domain name into components
*/
url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
}
else if (url->dcount != 0)
{
- /*
+ /*
* Save a copy of the pointers in dvec
*/
size = (size_t)url->dcount * sizeof(*url->dvec);
-
+
url->dvec = (char **)malloc(size);
if (NULL == url->dvec)
{
{
const unsigned char *pat = (const unsigned char *)pattern;
const unsigned char *txt = (const unsigned char *)text;
- const unsigned char *fallback = pat;
+ const unsigned char *fallback = pat;
int wildcard = 0;
-
+
unsigned char lastchar = 'a';
unsigned i;
unsigned char charmap[32];
-
+
while (*txt)
{
}
/* '*' in the pattern? */
- if (*pat == '*')
+ if (*pat == '*')
{
-
+
/* The pattern ends afterwards? Speed up the return. */
if (*++pat == '\0')
{
return 0;
}
-
+
/* Else, set wildcard mode and remember position after '*' */
wildcard = 1;
fallback = pat;
while (*++pat != ']')
{
if (!*pat)
- {
+ {
return 1;
}
else if (*pat == '-')
for (i = lastchar; i <= *pat; i++)
{
charmap[i / 8] |= (unsigned char)(1 << (i % 8));
- }
+ }
}
else
{
} /* -END- if Character range specification */
- /*
- * Char match, or char range match?
+ /*
+ * Char match, or char range match?
*/
if ( (*pat == *txt)
|| (*pat == '?')
|| ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))) )
{
- /*
- * Sucess: Go ahead
+ /*
+ * Success: Go ahead
*/
pat++;
}
else if (!wildcard)
{
- /*
+ /*
* No match && no wildcard: No luck
*/
return 1;
}
/* Is it a tag pattern? */
- if (0 == strncmpic("TAG:", url->spec, 4))
+ if (0 == strncmpic(url->spec, "TAG:", 4))
{
/* The pattern starts with the first character after "TAG:" */
const char *tag_pattern = buf + 4;
{
/* It's a tag pattern and shouldn't be matched against URLs */
return 0;
- }
+ }
return (port_matches(http->port, pattern->port_list)
&& host_matches(http, pattern) && path_matches(http->path, pattern));