urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.33 2008/04/12 14:03:13 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2003, 2006-2008 the SourceForge
  10  *                Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  * Revisions   :
  35  *    $Log: urlmatch.c,v $
  36  *    Revision 1.33  2008/04/12 14:03:13  fabiankeil
  37  *    Remove an obvious comment and improve another one.
  38  *
  39  *    Revision 1.32  2008/04/12 12:38:06  fabiankeil
  40  *    Factor out duplicated code to compile host, path and tag patterns.
  41  *
  42  *    Revision 1.31  2008/04/10 14:41:04  fabiankeil
  43  *    Ditch url_spec's path member now that it's no longer used.
  44  *
  45  *    Revision 1.30  2008/04/10 04:24:24  fabiankeil
  46  *    Stop duplicating the plain text representation of the path regex
  47  *    (and keeping the copy around). Once the regex is compiled it's no
  48  *    longer useful.
  49  *
  50  *    Revision 1.29  2008/04/10 04:17:56  fabiankeil
  51  *    In url_match(), check the right member for NULL when determining
  52  *    whether there's a path regex to execute. Looking for a plain-text
  53  *    representation works as well, but it looks "interesting" and that
  54  *    member will be removed soonish anyway.
  55  *
  56  *    Revision 1.28  2008/04/08 16:07:39  fabiankeil
  57  *    Make it harder to mistake url_match()'s
  58  *    second parameter for an url_spec.
  59  *
  60  *    Revision 1.27  2008/04/08 15:44:33  fabiankeil
  61  *    Save a bit of memory (and a few cpu cycles) by not bothering to
  62  *    compile slash-only path regexes that don't affect the result.
  63  *
  64  *    Revision 1.26  2008/04/07 16:57:18  fabiankeil
  65  *    - Use free_url_spec() more consistently.
  66  *    - Let it reset url->dcount just in case.
  67  *
  68  *    Revision 1.25  2008/04/06 15:18:38  fabiankeil
  69  *    Oh well, rename the --enable-pcre-host-patterns option to
  70  *    --enable-extended-host-patterns as it's not really PCRE syntax.
  71  *
  72  *    Revision 1.24  2008/04/06 14:54:26  fabiankeil
  73  *    Use PCRE syntax in host patterns when configured
  74  *    with --enable-pcre-host-patterns.
  75  *
  76  *    Revision 1.23  2008/04/05 12:19:20  fabiankeil
  77  *    Factor compile_host_pattern() out of create_url_spec().
  78  *
  79  *    Revision 1.22  2008/03/30 15:02:32  fabiankeil
  80  *    SZitify unknown_method().
  81  *
  82  *    Revision 1.21  2007/12/24 16:34:23  fabiankeil
  83  *    Band-aid (and micro-optimization) that makes it less likely to run out of
  84  *    stack space with overly-complex path patterns. Probably masks the problem
  85  *    reported by Lee in #1856679. Hohoho.
  86  *
  87  *    Revision 1.20  2007/09/02 15:31:20  fabiankeil
  88  *    Move match_portlist() from filter.c to urlmatch.c.
  89  *    It's used for url matching, not for filtering.
  90  *
  91  *    Revision 1.19  2007/09/02 13:42:11  fabiankeil
  92  *    - Allow port lists in url patterns.
  93  *    - Ditch unused url_spec member pathlen.
  94  *
  95  *    Revision 1.18  2007/07/30 16:42:21  fabiankeil
  96  *    Move the method check into unknown_method()
  97  *    and loop through the known methods instead
  98  *    of using a screen-long OR chain.
  99  *
 100  *    Revision 1.17  2007/04/15 16:39:21  fabiankeil
 101  *    Introduce tags as alternative way to specify which
 102  *    actions apply to a request. At the moment tags can be
 103  *    created based on client and server headers.
 104  *
 105  *    Revision 1.16  2007/02/13 13:59:24  fabiankeil
 106  *    Remove redundant log message.
 107  *
 108  *    Revision 1.15  2007/01/28 16:11:23  fabiankeil
 109  *    Accept WebDAV methods for subversion
 110  *    in parse_http_request(). Closes FR 1581425.
 111  *
 112  *    Revision 1.14  2007/01/06 14:23:56  fabiankeil
 113  *    Fix gcc43 warnings. Mark *csp as immutable
 114  *    for parse_http_url() and url_match().
 115  *    Replace a sprintf call with snprintf.
 116  *
 117  *    Revision 1.13  2006/12/06 19:50:54  fabiankeil
 118  *    parse_http_url() now handles intercepted
 119  *    HTTP request lines as well. Moved parts
 120  *    of parse_http_url()'s code into
 121  *    init_domain_components() so that it can
 122  *    be reused in chat().
 123  *
 124  *    Revision 1.12  2006/07/18 14:48:47  david__schmidt
 125  *    Reorganizing the repository: swapping out what was HEAD (the old 3.1 branch)
 126  *    with what was really the latest development (the v_3_0_branch branch)
 127  *
 128  *    Revision 1.10.2.7  2003/05/17 15:57:24  oes
 129  *     - parse_http_url now checks memory allocation failure for
 130  *       duplication of "*" URL and rejects "*something" URLs
 131  *       Closes bug #736344
 132  *     - Added a comment to what might look like a bug in
 133  *       create_url_spec (see !bug #736931)
 134  *     - Comment cosmetics
 135  *
 136  *    Revision 1.10.2.6  2003/05/07 12:39:48  oes
 137  *    Fix typo: Default port for https URLs is 443, not 143.
 138  *    Thanks to Scott Tregear for spotting this one.
 139  *
 140  *    Revision 1.10.2.5  2003/02/28 13:09:29  oes
 141  *    Fixed a rare double free condition as per Bug #694713
 142  *
 143  *    Revision 1.10.2.4  2003/02/28 12:57:44  oes
 144  *    Moved freeing of http request structure to its owner
 145  *    as per Dan Price's observations in Bug #694713
 146  *
 147  *    Revision 1.10.2.3  2002/11/12 16:50:40  oes
 148  *    Fixed memory leak in parse_http_request() reported by Oliver Stoeneberg. Fixes bug #637073
 149  *
 150  *    Revision 1.10.2.2  2002/09/25 14:53:15  oes
 151  *    Added basic support for OPTIONS and TRACE HTTP methods:
 152  *    parse_http_url now recognizes the "*" URI as well as
 153  *    the OPTIONS and TRACE method keywords.
 154  *
 155  *    Revision 1.10.2.1  2002/06/06 19:06:44  jongfoster
 156  *    Adding support for proprietary Microsoft WebDAV extensions
 157  *
 158  *    Revision 1.10  2002/05/12 21:40:37  jongfoster
 159  *    - Removing some unused code
 160  *
 161  *    Revision 1.9  2002/04/04 00:36:36  gliptak
 162  *    always use pcre for matching
 163  *
 164  *    Revision 1.8  2002/04/03 23:32:47  jongfoster
 165  *    Fixing memory leak on error
 166  *
 167  *    Revision 1.7  2002/03/26 22:29:55  swa
 168  *    we have a new homepage!
 169  *
 170  *    Revision 1.6  2002/03/24 13:25:43  swa
 171  *    name change related issues
 172  *
 173  *    Revision 1.5  2002/03/13 00:27:05  jongfoster
 174  *    Killing warnings
 175  *
 176  *    Revision 1.4  2002/03/07 03:46:17  oes
 177  *    Fixed compiler warnings
 178  *
 179  *    Revision 1.3  2002/03/03 14:51:11  oes
 180  *    Fixed CLF logging: Added ocmd member for client's request to struct http_request
 181  *
 182  *    Revision 1.2  2002/01/21 00:14:09  jongfoster
 183  *    Correcting comment style
 184  *    Fixing an uninitialized memory bug in create_url_spec()
 185  *
 186  *    Revision 1.1  2002/01/17 20:53:46  jongfoster
 187  *    Moving all our URL and URL pattern parsing code to the same file - it
 188  *    was scattered around in filters.c, loaders.c and parsers.c.
 189  *
 190  *    Providing a single, simple url_match(pattern,url) function - rather than
 191  *    the 3-line match routine which was repeated all over the place.
 192  *
 193  *    Renaming free_url to free_url_spec, since it frees a struct url_spec.
 194  *
 195  *    Providing parse_http_url() so that URLs can be parsed without faking a
 196  *    HTTP request line for parse_http_request() or repeating the parsing
 197  *    code (both of which were techniques that were actually in use).
 198  *
 199  *    Standardizing that struct http_request is used to represent a URL, and
 200  *    struct url_spec is used to represent a URL pattern.  (Before, URLs were
 201  *    represented as separate variables and a partially-filled-in url_spec).
 202  *
 203  *
 204  *********************************************************************/
 205 \f
 206
 207 #include "config.h"
 208
 209 #ifndef _WIN32
 210 #include <stdio.h>
 211 #include <sys/types.h>
 212 #endif
 213
 214 #include <stdlib.h>
 215 #include <ctype.h>
 216 #include <assert.h>
 217 #include <string.h>
 218
 219 #if !defined(_WIN32) && !defined(__OS2__)
 220 #include <unistd.h>
 221 #endif
 222
 223 #include "project.h"
 224 #include "urlmatch.h"
 225 #include "ssplit.h"
 226 #include "miscutil.h"
 227 #include "errlog.h"
 228
 229 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
 230
 231 enum regex_anchoring {NO_ANCHORING, LEFT_ANCHORED, RIGHT_ANCHORED};
 232 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern);
 233
 234 /*********************************************************************
 235  *
 236  * Function    :  free_http_request
 237  *
 238  * Description :  Freez a http_request structure
 239  *
 240  * Parameters  :
 241  *          1  :  http = points to a http_request structure to free
 242  *
 243  * Returns     :  N/A
 244  *
 245  *********************************************************************/
 246 void free_http_request(struct http_request *http)
 247 {
 248    assert(http);
 249
 250    freez(http->cmd);
 251    freez(http->ocmd);
 252    freez(http->gpc);
 253    freez(http->host);
 254    freez(http->url);
 255    freez(http->hostport);
 256    freez(http->path);
 257    freez(http->ver);
 258    freez(http->host_ip_addr_str);
 259    freez(http->dbuffer);
 260    freez(http->dvec);
 261    http->dcount = 0;
 262 }
 263
 264
 265 /*********************************************************************
 266  *
 267  * Function    :  init_domain_components
 268  *
 269  * Description :  Splits the domain name so we can compare it
 270  *                against wildcards. It used to be part of
 271  *                parse_http_url, but was separated because the
 272  *                same code is required in chat in case of
 273  *                intercepted requests.
 274  *
 275  * Parameters  :
 276  *          1  :  http = pointer to the http structure to hold elements.
 277  *
 278  * Returns     :  JB_ERR_OK on success
 279  *                JB_ERR_MEMORY on out of memory
 280  *                JB_ERR_PARSE on malformed command/URL
 281  *                             or >100 domains deep.
 282  *
 283  *********************************************************************/
 284 jb_err init_domain_components(struct http_request *http)
 285 {
 286    char *vec[BUFFER_SIZE];
 287    size_t size;
 288    char *p;
 289
 290    http->dbuffer = strdup(http->host);
 291    if (NULL == http->dbuffer)
 292    {
 293       return JB_ERR_MEMORY;
 294    }
 295
 296    /* map to lower case */
 297    for (p = http->dbuffer; *p ; p++)
 298    {
 299       *p = (char)tolower((int)(unsigned char)*p);
 300    }
 301
 302    /* split the domain name into components */
 303    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 304
 305    if (http->dcount <= 0)
 306    {
 307       /*
 308        * Error: More than SZ(vec) components in domain
 309        *    or: no components in domain
 310        */
 311       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 312       return JB_ERR_PARSE;
 313    }
 314
 315    /* save a copy of the pointers in dvec */
 316    size = (size_t)http->dcount * sizeof(*http->dvec);
 317
 318    http->dvec = (char **)malloc(size);
 319    if (NULL == http->dvec)
 320    {
 321       return JB_ERR_MEMORY;
 322    }
 323
 324    memcpy(http->dvec, vec, size);
 325
 326    return JB_ERR_OK;
 327 }
 328
 329
 330 /*********************************************************************
 331  *
 332  * Function    :  parse_http_url
 333  *
 334  * Description :  Parse out the host and port from the URL.  Find the
 335  *                hostname & path, port (if ':'), and/or password (if '@')
 336  *
 337  * Parameters  :
 338  *          1  :  url = URL (or is it URI?) to break down
 339  *          2  :  http = pointer to the http structure to hold elements.
 340  *                       Will be zeroed before use.  Note that this
 341  *                       function sets the http->gpc and http->ver
 342  *                       members to NULL.
 343  *          3  :  csp = Current client state (buffers, headers, etc...)
 344  *
 345  * Returns     :  JB_ERR_OK on success
 346  *                JB_ERR_MEMORY on out of memory
 347  *                JB_ERR_PARSE on malformed command/URL
 348  *                             or >100 domains deep.
 349  *
 350  *********************************************************************/
 351 jb_err parse_http_url(const char * url,
 352                       struct http_request *http,
 353                       const struct client_state *csp)
 354 {
 355    int host_available = 1; /* A proxy can dream. */
 356
 357    /*
 358     * Zero out the results structure
 359     */
 360    memset(http, '\0', sizeof(*http));
 361
 362
 363    /*
 364     * Save our initial URL
 365     */
 366    http->url = strdup(url);
 367    if (http->url == NULL)
 368    {
 369       return JB_ERR_MEMORY;
 370    }
 371
 372
 373    /*
 374     * Check for * URI. If found, we're done.
 375     */
 376    if (*http->url == '*')
 377    {
 378       if  ( NULL == (http->path = strdup("*"))
 379          || NULL == (http->hostport = strdup("")) )
 380       {
 381          return JB_ERR_MEMORY;
 382       }
 383       if (http->url[1] != '\0')
 384       {
 385          return JB_ERR_PARSE;
 386       }
 387       return JB_ERR_OK;
 388    }
 389
 390
 391    /*
 392     * Split URL into protocol,hostport,path.
 393     */
 394    {
 395       char *buf;
 396       char *url_noproto;
 397       char *url_path;
 398
 399       buf = strdup(url);
 400       if (buf == NULL)
 401       {
 402          return JB_ERR_MEMORY;
 403       }
 404
 405       /* Find the start of the URL in our scratch space */
 406       url_noproto = buf;
 407       if (strncmpic(url_noproto, "http://",  7) == 0)
 408       {
 409          url_noproto += 7;
 410          http->ssl = 0;
 411       }
 412       else if (strncmpic(url_noproto, "https://", 8) == 0)
 413       {
 414          url_noproto += 8;
 415          http->ssl = 1;
 416       }
 417       else if (*url_noproto == '/')
 418       {
 419         /*
 420          * Short request line without protocol and host.
 421          * Most likely because the client's request
 422          * was intercepted and redirected into Privoxy.
 423          */
 424          http->ssl = 0;
 425          http->host = NULL;
 426          host_available = 0;
 427       }
 428       else
 429       {
 430          http->ssl = 0;
 431       }
 432
 433       url_path = strchr(url_noproto, '/');
 434       if (url_path != NULL)
 435       {
 436          /*
 437           * Got a path.
 438           *
 439           * NOTE: The following line ignores the path for HTTPS URLS.
 440           * This means that you get consistent behaviour if you type a
 441           * https URL in and it's parsed by the function.  (When the
 442           * URL is actually retrieved, SSL hides the path part).
 443           */
 444          http->path = strdup(http->ssl ? "/" : url_path);
 445          *url_path = '\0';
 446          http->hostport = strdup(url_noproto);
 447       }
 448       else
 449       {
 450          /*
 451           * Repair broken HTTP requests that don't contain a path,
 452           * or CONNECT requests
 453           */
 454          http->path = strdup("/");
 455          http->hostport = strdup(url_noproto);
 456       }
 457
 458       freez(buf);
 459
 460       if ( (http->path == NULL)
 461         || (http->hostport == NULL))
 462       {
 463          return JB_ERR_MEMORY;
 464       }
 465    }
 466
 467    if (!host_available)
 468    {
 469       /* Without host, there is nothing left to do here */
 470       return JB_ERR_OK;
 471    }
 472
 473    /*
 474     * Split hostport into user/password (ignored), host, port.
 475     */
 476    {
 477       char *buf;
 478       char *host;
 479       char *port;
 480
 481       buf = strdup(http->hostport);
 482       if (buf == NULL)
 483       {
 484          return JB_ERR_MEMORY;
 485       }
 486
 487       /* check if url contains username and/or password */
 488       host = strchr(buf, '@');
 489       if (host != NULL)
 490       {
 491          /* Contains username/password, skip it and the @ sign. */
 492          host++;
 493       }
 494       else
 495       {
 496          /* No username or password. */
 497          host = buf;
 498       }
 499
 500       /* check if url contains port */
 501       port = strchr(host, ':');
 502       if (port != NULL)
 503       {
 504          /* Contains port */
 505          /* Terminate hostname and point to start of port string */
 506          *port++ = '\0';
 507          http->port = atoi(port);
 508       }
 509       else
 510       {
 511          /* No port specified. */
 512          http->port = (http->ssl ? 443 : 80);
 513       }
 514
 515       http->host = strdup(host);
 516
 517       free(buf);
 518
 519       if (http->host == NULL)
 520       {
 521          return JB_ERR_MEMORY;
 522       }
 523    }
 524
 525    /*
 526     * Split domain name so we can compare it against wildcards
 527     */
 528    return init_domain_components(http);
 529
 530 }
 531
 532
 533 /*********************************************************************
 534  *
 535  * Function    :  unknown_method
 536  *
 537  * Description :  Checks whether a method is unknown.
 538  *
 539  * Parameters  :
 540  *          1  :  method = points to a http method
 541  *
 542  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 543  *
 544  *********************************************************************/
 545 static int unknown_method(const char *method)
 546 {
 547    static const char *known_http_methods[] = {
 548       /* Basic HTTP request type */
 549       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 550       /* webDAV extensions (RFC2518) */
 551       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 552       /*
 553        * Microsoft webDAV extension for Exchange 2000.  See:
 554        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 555        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 556        */
 557       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 558       /*
 559        * Another Microsoft webDAV extension for Exchange 2000.  See:
 560        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 561        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 562        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 563        */
 564       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 565       /*
 566        * Yet another WebDAV extension, this time for
 567        * Web Distributed Authoring and Versioning (RFC3253)
 568        */
 569       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 570       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 571    };
 572    int i;
 573
 574    for (i = 0; i < SZ(known_http_methods); i++)
 575    {
 576       if (0 == strcmpic(method, known_http_methods[i]))
 577       {
 578          return FALSE;
 579       }
 580    }
 581
 582    return TRUE;
 583
 584 }
 585
 586
 587 /*********************************************************************
 588  *
 589  * Function    :  parse_http_request
 590  *
 591  * Description :  Parse out the host and port from the URL.  Find the
 592  *                hostname & path, port (if ':'), and/or password (if '@')
 593  *
 594  * Parameters  :
 595  *          1  :  req = HTTP request line to break down
 596  *          2  :  http = pointer to the http structure to hold elements
 597  *          3  :  csp = Current client state (buffers, headers, etc...)
 598  *
 599  * Returns     :  JB_ERR_OK on success
 600  *                JB_ERR_MEMORY on out of memory
 601  *                JB_ERR_CGI_PARAMS on malformed command/URL
 602  *                                  or >100 domains deep.
 603  *
 604  *********************************************************************/
 605 jb_err parse_http_request(const char *req,
 606                           struct http_request *http,
 607                           const struct client_state *csp)
 608 {
 609    char *buf;
 610    char *v[10]; /* XXX: Why 10? We should only need three. */
 611    int n;
 612    jb_err err;
 613    int is_connect = 0;
 614
 615    memset(http, '\0', sizeof(*http));
 616
 617    buf = strdup(req);
 618    if (buf == NULL)
 619    {
 620       return JB_ERR_MEMORY;
 621    }
 622
 623    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 624    if (n != 3)
 625    {
 626       free(buf);
 627       return JB_ERR_PARSE;
 628    }
 629
 630    /*
 631     * Fail in case of unknown methods
 632     * which we might not handle correctly.
 633     *
 634     * XXX: There should be a config option
 635     * to forward requests with unknown methods
 636     * anyway. Most of them don't need special
 637     * steps.
 638     */
 639    if (unknown_method(v[0]))
 640    {
 641       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 642       free(buf);
 643       return JB_ERR_PARSE;
 644    }
 645
 646    if (strcmpic(v[0], "CONNECT") == 0)
 647    {
 648       is_connect = 1;
 649    }
 650
 651    err = parse_http_url(v[1], http, csp);
 652    if (err)
 653    {
 654       free(buf);
 655       return err;
 656    }
 657
 658    /*
 659     * Copy the details into the structure
 660     */
 661    http->ssl = is_connect;
 662    http->cmd = strdup(req);
 663    http->gpc = strdup(v[0]);
 664    http->ver = strdup(v[2]);
 665
 666    if ( (http->cmd == NULL)
 667      || (http->gpc == NULL)
 668      || (http->ver == NULL) )
 669    {
 670       free(buf);
 671       return JB_ERR_MEMORY;
 672    }
 673
 674    free(buf);
 675    return JB_ERR_OK;
 676
 677 }
 678
 679
 680 /*********************************************************************
 681  *
 682  * Function    :  compile_pattern
 683  *
 684  * Description :  Compiles a host, domain or TAG pattern.
 685  *
 686  * Parameters  :
 687  *          1  :  pattern = The pattern to compile.
 688  *          2  :  anchoring = How the regex should be anchored.
 689  *                            Can be either one of NO_ANCHORING,
 690  *                            LEFT_ANCHORED or RIGHT_ANCHORED.
 691  *          3  :  url     = In case of failures, the spec member is
 692  *                          logged and the structure freed.
 693  *          4  :  regex   = Where the compiled regex should be stored.
 694  *
 695  * Returns     :  JB_ERR_OK - Success
 696  *                JB_ERR_MEMORY - Out of memory
 697  *                JB_ERR_PARSE - Cannot parse regex
 698  *
 699  *********************************************************************/
 700 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 701                               struct url_spec *url, regex_t **regex)
 702 {
 703    int errcode;
 704    char rebuf[BUFFER_SIZE];
 705    const char *fmt;
 706
 707    assert(pattern);
 708    assert(strlen(pattern) < sizeof(rebuf) - 2);
 709
 710    if (pattern[0] == '\0')
 711    {
 712       *regex = NULL;
 713       return JB_ERR_OK;
 714    }
 715
 716    switch (anchoring)
 717    {
 718       case NO_ANCHORING:
 719          fmt = "%s";
 720          break;
 721       case RIGHT_ANCHORED:
 722          fmt = "%s$";
 723          break;
 724       case LEFT_ANCHORED:
 725          fmt = "^%s";
 726          break;
 727       default:
 728          log_error(LOG_LEVEL_FATAL,
 729             "Invalid anchoring in compile_pattern %d", anchoring);
 730    }
 731
 732    *regex = zalloc(sizeof(**regex));
 733    if (NULL == *regex)
 734    {
 735       free_url_spec(url);
 736       return JB_ERR_MEMORY;
 737    }
 738
 739    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 740
 741    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 742
 743    if (errcode)
 744    {
 745       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 746       if (errlen > (sizeof(rebuf) - (size_t)1))
 747       {
 748          errlen = sizeof(rebuf) - (size_t)1;
 749       }
 750       rebuf[errlen] = '\0';
 751       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 752          pattern, url->spec, rebuf);
 753       free_url_spec(url);
 754
 755       return JB_ERR_PARSE;
 756    }
 757
 758    return JB_ERR_OK;
 759
 760 }
 761
 762
 763 /*********************************************************************
 764  *
 765  * Function    :  compile_url_pattern
 766  *
 767  * Description :  Compiles the three parts of an URL pattern.
 768  *
 769  * Parameters  :
 770  *          1  :  url = Target url_spec to be filled in.
 771  *          2  :  buf = The url pattern to compile. Will be messed up.
 772  *
 773  * Returns     :  JB_ERR_OK - Success
 774  *                JB_ERR_MEMORY - Out of memory
 775  *                JB_ERR_PARSE - Cannot parse regex
 776  *
 777  *********************************************************************/
 778 static jb_err compile_url_pattern(struct url_spec *url, char *buf)
 779 {
 780    char *p;
 781
 782    p = strchr(buf, '/');
 783    if (NULL != p)
 784    {
 785       /*
 786        * Only compile the regex if it consists of more than
 787        * a single slash, otherwise it wouldn't affect the result.
 788        */
 789       if (p[1] != '\0')
 790       {
 791          /*
 792           * XXX: does it make sense to compile the slash at the beginning?
 793           */
 794          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->preg);
 795
 796          if (JB_ERR_OK != err)
 797          {
 798             return err;
 799          }
 800       }
 801       *p = '\0';
 802    }
 803
 804    p = strchr(buf, ':');
 805    if (NULL != p)
 806    {
 807       *p++ = '\0';
 808       url->port_list = strdup(p);
 809       if (NULL == url->port_list)
 810       {
 811          return JB_ERR_MEMORY;
 812       }
 813    }
 814    else
 815    {
 816       url->port_list = NULL;
 817    }
 818
 819    if (buf[0] != '\0')
 820    {
 821       return compile_host_pattern(url, buf);
 822    }
 823
 824    return JB_ERR_OK;
 825
 826 }
 827
 828
 829 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 830 /*********************************************************************
 831  *
 832  * Function    :  compile_host_pattern
 833  *
 834  * Description :  Parses and compiles a host pattern..
 835  *
 836  * Parameters  :
 837  *          1  :  url = Target url_spec to be filled in.
 838  *          2  :  host_pattern = Host pattern to compile.
 839  *
 840  * Returns     :  JB_ERR_OK - Success
 841  *                JB_ERR_MEMORY - Out of memory
 842  *                JB_ERR_PARSE - Cannot parse regex
 843  *
 844  *********************************************************************/
 845 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 846 {
 847    return compile_pattern(host_pattern, RIGHT_ANCHORED, url, &url->host_regex);
 848 }
 849
 850 #else
 851
 852 /*********************************************************************
 853  *
 854  * Function    :  compile_host_pattern
 855  *
 856  * Description :  Parses and "compiles" an old-school host pattern.
 857  *
 858  * Parameters  :
 859  *          1  :  url = Target url_spec to be filled in.
 860  *          2  :  host_pattern = Host pattern to parse.
 861  *
 862  * Returns     :  JB_ERR_OK - Success
 863  *                JB_ERR_MEMORY - Out of memory
 864  *                JB_ERR_PARSE - Cannot parse regex
 865  *
 866  *********************************************************************/
 867 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 868 {
 869    char *v[150];
 870    size_t size;
 871    char *p;
 872
 873    /*
 874     * Parse domain part
 875     */
 876    if (host_pattern[strlen(host_pattern) - 1] == '.')
 877    {
 878       url->unanchored |= ANCHOR_RIGHT;
 879    }
 880    if (host_pattern[0] == '.')
 881    {
 882       url->unanchored |= ANCHOR_LEFT;
 883    }
 884
 885    /*
 886     * Split domain into components
 887     */
 888    url->dbuffer = strdup(host_pattern);
 889    if (NULL == url->dbuffer)
 890    {
 891       free_url_spec(url);
 892       return JB_ERR_MEMORY;
 893    }
 894
 895    /*
 896     * Map to lower case
 897     */
 898    for (p = url->dbuffer; *p ; p++)
 899    {
 900       *p = (char)tolower((int)(unsigned char)*p);
 901    }
 902
 903    /*
 904     * Split the domain name into components
 905     */
 906    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 907
 908    if (url->dcount < 0)
 909    {
 910       free_url_spec(url);
 911       return JB_ERR_MEMORY;
 912    }
 913    else if (url->dcount != 0)
 914    {
 915       /*
 916        * Save a copy of the pointers in dvec
 917        */
 918       size = (size_t)url->dcount * sizeof(*url->dvec);
 919
 920       url->dvec = (char **)malloc(size);
 921       if (NULL == url->dvec)
 922       {
 923          free_url_spec(url);
 924          return JB_ERR_MEMORY;
 925       }
 926
 927       memcpy(url->dvec, v, size);
 928    }
 929    /*
 930     * else dcount == 0 in which case we needn't do anything,
 931     * since dvec will never be accessed and the pattern will
 932     * match all domains.
 933     */
 934    return JB_ERR_OK;
 935 }
 936
 937
 938 /*********************************************************************
 939  *
 940  * Function    :  simple_domaincmp
 941  *
 942  * Description :  Domain-wise Compare fqdn's.  The comparison is
 943  *                both left- and right-anchored.  The individual
 944  *                domain names are compared with simplematch().
 945  *                This is only used by domain_match.
 946  *
 947  * Parameters  :
 948  *          1  :  pv = array of patterns to compare
 949  *          2  :  fv = array of domain components to compare
 950  *          3  :  len = length of the arrays (both arrays are the
 951  *                      same length - if they weren't, it couldn't
 952  *                      possibly be a match).
 953  *
 954  * Returns     :  0 => domains are equivalent, else no match.
 955  *
 956  *********************************************************************/
 957 static int simple_domaincmp(char **pv, char **fv, int len)
 958 {
 959    int n;
 960
 961    for (n = 0; n < len; n++)
 962    {
 963       if (simplematch(pv[n], fv[n]))
 964       {
 965          return 1;
 966       }
 967    }
 968
 969    return 0;
 970
 971 }
 972
 973
 974 /*********************************************************************
 975  *
 976  * Function    :  domain_match
 977  *
 978  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 979  *                pattern->unachored, the comparison is un-, left-,
 980  *                right-anchored, or both.
 981  *                The individual domain names are compared with
 982  *                simplematch().
 983  *
 984  * Parameters  :
 985  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
 986  *          2  :  fqdn = domain name against which the patterns are compared.
 987  *
 988  * Returns     :  0 => domains are equivalent, else no match.
 989  *
 990  *********************************************************************/
 991 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
 992 {
 993    char **pv, **fv;  /* vectors  */
 994    int    plen, flen;
 995    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
 996
 997    plen = pattern->dcount;
 998    flen = fqdn->dcount;
 999
1000    if (flen < plen)
1001    {
1002       /* fqdn is too short to match this pattern */
1003       return 1;
1004    }
1005
1006    pv   = pattern->dvec;
1007    fv   = fqdn->dvec;
1008
1009    if (unanchored == ANCHOR_LEFT)
1010    {
1011       /*
1012        * Right anchored.
1013        *
1014        * Convert this into a fully anchored pattern with
1015        * the fqdn and pattern the same length
1016        */
1017       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1018       return simple_domaincmp(pv, fv, plen);
1019    }
1020    else if (unanchored == 0)
1021    {
1022       /* Fully anchored, check length */
1023       if (flen != plen)
1024       {
1025          return 1;
1026       }
1027       return simple_domaincmp(pv, fv, plen);
1028    }
1029    else if (unanchored == ANCHOR_RIGHT)
1030    {
1031       /* Left anchored, ignore all extra in fqdn */
1032       return simple_domaincmp(pv, fv, plen);
1033    }
1034    else
1035    {
1036       /* Unanchored */
1037       int n;
1038       int maxn = flen - plen;
1039       for (n = 0; n <= maxn; n++)
1040       {
1041          if (!simple_domaincmp(pv, fv, plen))
1042          {
1043             return 0;
1044          }
1045          /*
1046           * Doesn't match from start of fqdn
1047           * Try skipping first part of fqdn
1048           */
1049          fv++;
1050       }
1051       return 1;
1052    }
1053
1054 }
1055 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1056
1057
1058 /*********************************************************************
1059  *
1060  * Function    :  create_url_spec
1061  *
1062  * Description :  Creates a "url_spec" structure from a string.
1063  *                When finished, free with free_url_spec().
1064  *
1065  * Parameters  :
1066  *          1  :  url = Target url_spec to be filled in.  Will be
1067  *                      zeroed before use.
1068  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1069  *                      contents of this buffer are destroyed by this
1070  *                      function.  If this function succeeds, the
1071  *                      buffer is copied to url->spec.  If this
1072  *                      function fails, the contents of the buffer
1073  *                      are lost forever. XXX: Why is this const?
1074  *
1075  * Returns     :  JB_ERR_OK - Success
1076  *                JB_ERR_MEMORY - Out of memory
1077  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1078  *                               written to system log)
1079  *
1080  *********************************************************************/
1081 jb_err create_url_spec(struct url_spec * url, const char * buf)
1082 {
1083    assert(url);
1084    assert(buf);
1085
1086    memset(url, '\0', sizeof(*url));
1087
1088    /* Remember the original specification for the CGI pages. */
1089    url->spec = strdup(buf);
1090    if (NULL == url->spec)
1091    {
1092       return JB_ERR_MEMORY;
1093    }
1094
1095    /* Is it tag pattern? */
1096    if (0 == strncmpic("TAG:", url->spec, 4))
1097    {
1098       /* The pattern starts with the first character after "TAG:" */
1099       const char *tag_pattern = buf + 4;
1100       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
1101    }
1102
1103    /* If it isn't a tag pattern it must be a URL pattern. */
1104    return compile_url_pattern(url, (char *)buf);
1105 }
1106
1107
1108 /*********************************************************************
1109  *
1110  * Function    :  free_url_spec
1111  *
1112  * Description :  Called from the "unloaders".  Freez the url
1113  *                structure elements.
1114  *
1115  * Parameters  :
1116  *          1  :  url = pointer to a url_spec structure.
1117  *
1118  * Returns     :  N/A
1119  *
1120  *********************************************************************/
1121 void free_url_spec(struct url_spec *url)
1122 {
1123    if (url == NULL) return;
1124
1125    freez(url->spec);
1126 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1127    if (url->host_regex)
1128    {
1129       regfree(url->host_regex);
1130       freez(url->host_regex);
1131    }
1132 #else
1133    freez(url->dbuffer);
1134    freez(url->dvec);
1135    url->dcount = 0;
1136 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1137    freez(url->port_list);
1138    if (url->preg)
1139    {
1140       regfree(url->preg);
1141       freez(url->preg);
1142    }
1143    if (url->tag_regex)
1144    {
1145       regfree(url->tag_regex);
1146       freez(url->tag_regex);
1147    }
1148 }
1149
1150
1151 /*********************************************************************
1152  *
1153  * Function    :  url_match
1154  *
1155  * Description :  Compare a URL against a URL pattern.
1156  *
1157  * Parameters  :
1158  *          1  :  pattern = a URL pattern
1159  *          2  :  url = URL to match
1160  *
1161  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1162  *
1163  *********************************************************************/
1164 int url_match(const struct url_spec *pattern,
1165               const struct http_request *http)
1166 {
1167    /* XXX: these should probably be functions. */
1168 #define PORT_MATCHES ((NULL == pattern->port_list) || match_portlist(pattern->port_list, http->port))
1169 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1170 #define DOMAIN_MATCHES ((NULL == pattern->host_regex) || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)))
1171 #else
1172 #define DOMAIN_MATCHES ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)))
1173 #endif
1174 #define PATH_MATCHES ((NULL == pattern->preg) || (0 == regexec(pattern->preg, http->path, 0, NULL, 0)))
1175
1176    if (pattern->tag_regex != NULL)
1177    {
1178       /* It's a tag pattern and shouldn't be matched against URLs */
1179       return 0;
1180    }
1181
1182    return (PORT_MATCHES && DOMAIN_MATCHES && PATH_MATCHES);
1183
1184 }
1185
1186
/*********************************************************************
 *
 * Function    :  portlist_number
 *
 * Description :  Reads the decimal number at the start of one field
 *                of a port list.  The list is neither copied nor
 *                modified.  Helper for match_portlist.
 *
 * Parameters  :
 *          1  :  field = first character of the field
 *          2  :  field_end = the character that ends the field,
 *                            i.e. a ',', a '-' or the terminating
 *                            NUL of the list
 *
 * Returns     :  The number the field starts with (leading white
 *                space is skipped), or 0 if there is none.
 *
 *********************************************************************/
static int portlist_number(const char *field, const char *field_end)
{
   char *stop;
   long number = strtol(field, &stop, 10);

   /*
    * strtol() knows nothing about field_end.  It cannot get past a
    * ',' or the NUL, but if the field is empty or all white space
    * it takes the '-' that separates a range for a sign and converts
    * the upper bound instead.  Such a field holds no number.
    */
   if (stop > field_end)
   {
      return 0;
   }

   return (int)number;
}


/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *                The list is parsed in place, no memory is
 *                allocated.
 *
 *                An item without '-' matches if it equals the port.
 *                An item with '-' is split at its first '-' into a
 *                lower and an upper bound (both inclusive).  A
 *                missing lower bound counts as 0, a missing (or
 *                zero) upper bound counts as 65535.  An item that
 *                doesn't start with a number reads as 0, which
 *                includes empty items such as the one behind a
 *                trailing comma.
 *
 * Parameters  :
 *          1  :  portlist = String with list.  NULL is treated as
 *                           a list that covers no port at all.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      const char *comma    = strchr(item, (int) ',');
      const char *item_end = (NULL != comma) ? comma : item + strlen(item);
      const char *dash     = memchr(item, (int) '-', (size_t)(item_end - item));

      if (NULL == dash)
      {
         /*
          * No dash, check for equality
          */
         if (port == portlist_number(item, item_end))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         int low  = portlist_number(item, dash);
         int high = portlist_number(dash + 1, item_end);

         if (0 == high)
         {
            high = 65535;
         }

         if ((port >= low) && (port <= high))
         {
            return 1;
         }
      }

      /*
       * Jump to next item, if there is one
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;
}
1265
1266
1267 /*
1268   Local Variables:
1269   tab-width: 3
1270   end:
1271 */