urlmatch.c

/*
 * Revision-control identification string for this source file. The text
 * between the dollar signs has the form of an expanded RCS/CVS Id keyword.
 */
const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.30 2008/04/10 04:24:24 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2003, 2006-2008 the SourceForge
  10  *                Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  * Revisions   :
  35  *    $Log: urlmatch.c,v $
  36  *    Revision 1.30  2008/04/10 04:24:24  fabiankeil
  37  *    Stop duplicating the plain text representation of the path regex
  38  *    (and keeping the copy around). Once the regex is compiled it's no
  39  *    longer useful.
  40  *
  41  *    Revision 1.29  2008/04/10 04:17:56  fabiankeil
  42  *    In url_match(), check the right member for NULL when determining
  43  *    whether there's a path regex to execute. Looking for a plain-text
  44  *    representation works as well, but it looks "interesting" and that
  45  *    member will be removed soonish anyway.
  46  *
  47  *    Revision 1.28  2008/04/08 16:07:39  fabiankeil
  48  *    Make it harder to mistake url_match()'s
  49  *    second parameter for an url_spec.
  50  *
  51  *    Revision 1.27  2008/04/08 15:44:33  fabiankeil
  52  *    Save a bit of memory (and a few cpu cycles) by not bothering to
  53  *    compile slash-only path regexes that don't affect the result.
  54  *
  55  *    Revision 1.26  2008/04/07 16:57:18  fabiankeil
  56  *    - Use free_url_spec() more consistently.
  57  *    - Let it reset url->dcount just in case.
  58  *
  59  *    Revision 1.25  2008/04/06 15:18:38  fabiankeil
  60  *    Oh well, rename the --enable-pcre-host-patterns option to
  61  *    --enable-extended-host-patterns as it's not really PCRE syntax.
  62  *
  63  *    Revision 1.24  2008/04/06 14:54:26  fabiankeil
  64  *    Use PCRE syntax in host patterns when configured
  65  *    with --enable-pcre-host-patterns.
  66  *
  67  *    Revision 1.23  2008/04/05 12:19:20  fabiankeil
  68  *    Factor compile_host_pattern() out of create_url_spec().
  69  *
  70  *    Revision 1.22  2008/03/30 15:02:32  fabiankeil
  71  *    SZitify unknown_method().
  72  *
  73  *    Revision 1.21  2007/12/24 16:34:23  fabiankeil
  74  *    Band-aid (and micro-optimization) that makes it less likely to run out of
  75  *    stack space with overly-complex path patterns. Probably masks the problem
  76  *    reported by Lee in #1856679. Hohoho.
  77  *
  78  *    Revision 1.20  2007/09/02 15:31:20  fabiankeil
  79  *    Move match_portlist() from filter.c to urlmatch.c.
  80  *    It's used for url matching, not for filtering.
  81  *
  82  *    Revision 1.19  2007/09/02 13:42:11  fabiankeil
  83  *    - Allow port lists in url patterns.
  84  *    - Ditch unused url_spec member pathlen.
  85  *
  86  *    Revision 1.18  2007/07/30 16:42:21  fabiankeil
  87  *    Move the method check into unknown_method()
  88  *    and loop through the known methods instead
  89  *    of using a screen-long OR chain.
  90  *
  91  *    Revision 1.17  2007/04/15 16:39:21  fabiankeil
  92  *    Introduce tags as alternative way to specify which
  93  *    actions apply to a request. At the moment tags can be
  94  *    created based on client and server headers.
  95  *
  96  *    Revision 1.16  2007/02/13 13:59:24  fabiankeil
  97  *    Remove redundant log message.
  98  *
  99  *    Revision 1.15  2007/01/28 16:11:23  fabiankeil
 100  *    Accept WebDAV methods for subversion
 101  *    in parse_http_request(). Closes FR 1581425.
 102  *
 103  *    Revision 1.14  2007/01/06 14:23:56  fabiankeil
 104  *    Fix gcc43 warnings. Mark *csp as immutable
 105  *    for parse_http_url() and url_match().
 106  *    Replace a sprintf call with snprintf.
 107  *
 108  *    Revision 1.13  2006/12/06 19:50:54  fabiankeil
 109  *    parse_http_url() now handles intercepted
 110  *    HTTP request lines as well. Moved parts
 111  *    of parse_http_url()'s code into
 112  *    init_domain_components() so that it can
 113  *    be reused in chat().
 114  *
 115  *    Revision 1.12  2006/07/18 14:48:47  david__schmidt
 116  *    Reorganizing the repository: swapping out what was HEAD (the old 3.1 branch)
 117  *    with what was really the latest development (the v_3_0_branch branch)
 118  *
 119  *    Revision 1.10.2.7  2003/05/17 15:57:24  oes
 120  *     - parse_http_url now checks memory allocation failure for
 121  *       duplication of "*" URL and rejects "*something" URLs
 122  *       Closes bug #736344
 123  *     - Added a comment to what might look like a bug in
 124  *       create_url_spec (see !bug #736931)
 125  *     - Comment cosmetics
 126  *
 127  *    Revision 1.10.2.6  2003/05/07 12:39:48  oes
 128  *    Fix typo: Default port for https URLs is 443, not 143.
 129  *    Thanks to Scott Tregear for spotting this one.
 130  *
 131  *    Revision 1.10.2.5  2003/02/28 13:09:29  oes
 132  *    Fixed a rare double free condition as per Bug #694713
 133  *
 134  *    Revision 1.10.2.4  2003/02/28 12:57:44  oes
 135  *    Moved freeing of http request structure to its owner
 136  *    as per Dan Price's observations in Bug #694713
 137  *
 138  *    Revision 1.10.2.3  2002/11/12 16:50:40  oes
 139  *    Fixed memory leak in parse_http_request() reported by Oliver Stoeneberg. Fixes bug #637073
 140  *
 141  *    Revision 1.10.2.2  2002/09/25 14:53:15  oes
 142  *    Added basic support for OPTIONS and TRACE HTTP methods:
 143  *    parse_http_url now recognizes the "*" URI as well as
 144  *    the OPTIONS and TRACE method keywords.
 145  *
 146  *    Revision 1.10.2.1  2002/06/06 19:06:44  jongfoster
 147  *    Adding support for proprietary Microsoft WebDAV extensions
 148  *
 149  *    Revision 1.10  2002/05/12 21:40:37  jongfoster
 150  *    - Removing some unused code
 151  *
 152  *    Revision 1.9  2002/04/04 00:36:36  gliptak
 153  *    always use pcre for matching
 154  *
 155  *    Revision 1.8  2002/04/03 23:32:47  jongfoster
 156  *    Fixing memory leak on error
 157  *
 158  *    Revision 1.7  2002/03/26 22:29:55  swa
 159  *    we have a new homepage!
 160  *
 161  *    Revision 1.6  2002/03/24 13:25:43  swa
 162  *    name change related issues
 163  *
 164  *    Revision 1.5  2002/03/13 00:27:05  jongfoster
 165  *    Killing warnings
 166  *
 167  *    Revision 1.4  2002/03/07 03:46:17  oes
 168  *    Fixed compiler warnings
 169  *
 170  *    Revision 1.3  2002/03/03 14:51:11  oes
 171  *    Fixed CLF logging: Added ocmd member for client's request to struct http_request
 172  *
 173  *    Revision 1.2  2002/01/21 00:14:09  jongfoster
 174  *    Correcting comment style
 175  *    Fixing an uninitialized memory bug in create_url_spec()
 176  *
 177  *    Revision 1.1  2002/01/17 20:53:46  jongfoster
 178  *    Moving all our URL and URL pattern parsing code to the same file - it
 179  *    was scattered around in filters.c, loaders.c and parsers.c.
 180  *
 181  *    Providing a single, simple url_match(pattern,url) function - rather than
 182  *    the 3-line match routine which was repeated all over the place.
 183  *
 184  *    Renaming free_url to free_url_spec, since it frees a struct url_spec.
 185  *
 186  *    Providing parse_http_url() so that URLs can be parsed without faking a
 187  *    HTTP request line for parse_http_request() or repeating the parsing
 188  *    code (both of which were techniques that were actually in use).
 189  *
 190  *    Standardizing that struct http_request is used to represent a URL, and
 191  *    struct url_spec is used to represent a URL pattern.  (Before, URLs were
  192  *    represented as separate variables and a partially-filled-in url_spec).
 193  *
 194  *
 195  *********************************************************************/
 196 \f
 197
 198 #include "config.h"
 199
 200 #ifndef _WIN32
 201 #include <stdio.h>
 202 #include <sys/types.h>
 203 #endif
 204
 205 #include <stdlib.h>
 206 #include <ctype.h>
 207 #include <assert.h>
 208 #include <string.h>
 209
 210 #if !defined(_WIN32) && !defined(__OS2__)
 211 #include <unistd.h>
 212 #endif
 213
 214 #include "project.h"
 215 #include "urlmatch.h"
 216 #include "ssplit.h"
 217 #include "miscutil.h"
 218 #include "errlog.h"
 219
/*
 * Identification string taken from the URLMATCH_H_VERSION macro
 * (presumably defined by urlmatch.h - confirm).
 */
const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
 221
 222
 223 /*********************************************************************
 224  *
 225  * Function    :  free_http_request
 226  *
 227  * Description :  Freez a http_request structure
 228  *
 229  * Parameters  :
 230  *          1  :  http = points to a http_request structure to free
 231  *
 232  * Returns     :  N/A
 233  *
 234  *********************************************************************/
 235 void free_http_request(struct http_request *http)
 236 {
 237    assert(http);
 238
 239    freez(http->cmd);
 240    freez(http->ocmd);
 241    freez(http->gpc);
 242    freez(http->host);
 243    freez(http->url);
 244    freez(http->hostport);
 245    freez(http->path);
 246    freez(http->ver);
 247    freez(http->host_ip_addr_str);
 248    freez(http->dbuffer);
 249    freez(http->dvec);
 250    http->dcount = 0;
 251 }
 252
 253
 254 /*********************************************************************
 255  *
 256  * Function    :  init_domain_components
 257  *
 258  * Description :  Splits the domain name so we can compare it
 259  *                against wildcards. It used to be part of
 260  *                parse_http_url, but was separated because the
 261  *                same code is required in chat in case of
 262  *                intercepted requests.
 263  *
 264  * Parameters  :
 265  *          1  :  http = pointer to the http structure to hold elements.
 266  *
 267  * Returns     :  JB_ERR_OK on success
 268  *                JB_ERR_MEMORY on out of memory
 269  *                JB_ERR_PARSE on malformed command/URL
 270  *                             or >100 domains deep.
 271  *
 272  *********************************************************************/
 273 jb_err init_domain_components(struct http_request *http)
 274 {
 275    char *vec[BUFFER_SIZE];
 276    size_t size;
 277    char *p;
 278
 279    http->dbuffer = strdup(http->host);
 280    if (NULL == http->dbuffer)
 281    {
 282       return JB_ERR_MEMORY;
 283    }
 284
 285    /* map to lower case */
 286    for (p = http->dbuffer; *p ; p++)
 287    {
 288       *p = (char)tolower((int)(unsigned char)*p);
 289    }
 290
 291    /* split the domain name into components */
 292    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 293
 294    if (http->dcount <= 0)
 295    {
 296       /*
 297        * Error: More than SZ(vec) components in domain
 298        *    or: no components in domain
 299        */
 300       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 301       return JB_ERR_PARSE;
 302    }
 303
 304    /* save a copy of the pointers in dvec */
 305    size = (size_t)http->dcount * sizeof(*http->dvec);
 306
 307    http->dvec = (char **)malloc(size);
 308    if (NULL == http->dvec)
 309    {
 310       return JB_ERR_MEMORY;
 311    }
 312
 313    memcpy(http->dvec, vec, size);
 314
 315    return JB_ERR_OK;
 316 }
 317
 318
 319 /*********************************************************************
 320  *
 321  * Function    :  parse_http_url
 322  *
 323  * Description :  Parse out the host and port from the URL.  Find the
 324  *                hostname & path, port (if ':'), and/or password (if '@')
 325  *
 326  * Parameters  :
 327  *          1  :  url = URL (or is it URI?) to break down
 328  *          2  :  http = pointer to the http structure to hold elements.
 329  *                       Will be zeroed before use.  Note that this
 330  *                       function sets the http->gpc and http->ver
 331  *                       members to NULL.
 332  *          3  :  csp = Current client state (buffers, headers, etc...)
 333  *
 334  * Returns     :  JB_ERR_OK on success
 335  *                JB_ERR_MEMORY on out of memory
 336  *                JB_ERR_PARSE on malformed command/URL
 337  *                             or >100 domains deep.
 338  *
 339  *********************************************************************/
 340 jb_err parse_http_url(const char * url,
 341                       struct http_request *http,
 342                       const struct client_state *csp)
 343 {
 344    int host_available = 1; /* A proxy can dream. */
 345
 346    /*
 347     * Zero out the results structure
 348     */
 349    memset(http, '\0', sizeof(*http));
 350
 351
 352    /*
 353     * Save our initial URL
 354     */
 355    http->url = strdup(url);
 356    if (http->url == NULL)
 357    {
 358       return JB_ERR_MEMORY;
 359    }
 360
 361
 362    /*
 363     * Check for * URI. If found, we're done.
 364     */
 365    if (*http->url == '*')
 366    {
 367       if  ( NULL == (http->path = strdup("*"))
 368          || NULL == (http->hostport = strdup("")) )
 369       {
 370          return JB_ERR_MEMORY;
 371       }
 372       if (http->url[1] != '\0')
 373       {
 374          return JB_ERR_PARSE;
 375       }
 376       return JB_ERR_OK;
 377    }
 378
 379
 380    /*
 381     * Split URL into protocol,hostport,path.
 382     */
 383    {
 384       char *buf;
 385       char *url_noproto;
 386       char *url_path;
 387
 388       buf = strdup(url);
 389       if (buf == NULL)
 390       {
 391          return JB_ERR_MEMORY;
 392       }
 393
 394       /* Find the start of the URL in our scratch space */
 395       url_noproto = buf;
 396       if (strncmpic(url_noproto, "http://",  7) == 0)
 397       {
 398          url_noproto += 7;
 399          http->ssl = 0;
 400       }
 401       else if (strncmpic(url_noproto, "https://", 8) == 0)
 402       {
 403          url_noproto += 8;
 404          http->ssl = 1;
 405       }
 406       else if (*url_noproto == '/')
 407       {
 408         /*
 409          * Short request line without protocol and host.
 410          * Most likely because the client's request
 411          * was intercepted and redirected into Privoxy.
 412          */
 413          http->ssl = 0;
 414          http->host = NULL;
 415          host_available = 0;
 416       }
 417       else
 418       {
 419          http->ssl = 0;
 420       }
 421
 422       url_path = strchr(url_noproto, '/');
 423       if (url_path != NULL)
 424       {
 425          /*
 426           * Got a path.
 427           *
 428           * NOTE: The following line ignores the path for HTTPS URLS.
 429           * This means that you get consistent behaviour if you type a
 430           * https URL in and it's parsed by the function.  (When the
 431           * URL is actually retrieved, SSL hides the path part).
 432           */
 433          http->path = strdup(http->ssl ? "/" : url_path);
 434          *url_path = '\0';
 435          http->hostport = strdup(url_noproto);
 436       }
 437       else
 438       {
 439          /*
 440           * Repair broken HTTP requests that don't contain a path,
 441           * or CONNECT requests
 442           */
 443          http->path = strdup("/");
 444          http->hostport = strdup(url_noproto);
 445       }
 446
 447       freez(buf);
 448
 449       if ( (http->path == NULL)
 450         || (http->hostport == NULL))
 451       {
 452          return JB_ERR_MEMORY;
 453       }
 454    }
 455
 456    if (!host_available)
 457    {
 458       /* Without host, there is nothing left to do here */
 459       return JB_ERR_OK;
 460    }
 461
 462    /*
 463     * Split hostport into user/password (ignored), host, port.
 464     */
 465    {
 466       char *buf;
 467       char *host;
 468       char *port;
 469
 470       buf = strdup(http->hostport);
 471       if (buf == NULL)
 472       {
 473          return JB_ERR_MEMORY;
 474       }
 475
 476       /* check if url contains username and/or password */
 477       host = strchr(buf, '@');
 478       if (host != NULL)
 479       {
 480          /* Contains username/password, skip it and the @ sign. */
 481          host++;
 482       }
 483       else
 484       {
 485          /* No username or password. */
 486          host = buf;
 487       }
 488
 489       /* check if url contains port */
 490       port = strchr(host, ':');
 491       if (port != NULL)
 492       {
 493          /* Contains port */
 494          /* Terminate hostname and point to start of port string */
 495          *port++ = '\0';
 496          http->port = atoi(port);
 497       }
 498       else
 499       {
 500          /* No port specified. */
 501          http->port = (http->ssl ? 443 : 80);
 502       }
 503
 504       http->host = strdup(host);
 505
 506       free(buf);
 507
 508       if (http->host == NULL)
 509       {
 510          return JB_ERR_MEMORY;
 511       }
 512    }
 513
 514    /*
 515     * Split domain name so we can compare it against wildcards
 516     */
 517    return init_domain_components(http);
 518
 519 }
 520
 521
 522 /*********************************************************************
 523  *
 524  * Function    :  unknown_method
 525  *
 526  * Description :  Checks whether a method is unknown.
 527  *
 528  * Parameters  :
 529  *          1  :  method = points to a http method
 530  *
 531  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 532  *
 533  *********************************************************************/
 534 static int unknown_method(const char *method)
 535 {
 536    static const char *known_http_methods[] = {
 537       /* Basic HTTP request type */
 538       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 539       /* webDAV extensions (RFC2518) */
 540       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 541       /*
 542        * Microsoft webDAV extension for Exchange 2000.  See:
 543        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 544        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 545        */
 546       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 547       /*
 548        * Another Microsoft webDAV extension for Exchange 2000.  See:
 549        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 550        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 551        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 552        */
 553       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 554       /*
 555        * Yet another WebDAV extension, this time for
 556        * Web Distributed Authoring and Versioning (RFC3253)
 557        */
 558       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 559       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 560    };
 561    int i;
 562
 563    for (i = 0; i < SZ(known_http_methods); i++)
 564    {
 565       if (0 == strcmpic(method, known_http_methods[i]))
 566       {
 567          return FALSE;
 568       }
 569    }
 570
 571    return TRUE;
 572
 573 }
 574
 575
 576 /*********************************************************************
 577  *
 578  * Function    :  parse_http_request
 579  *
 580  * Description :  Parse out the host and port from the URL.  Find the
 581  *                hostname & path, port (if ':'), and/or password (if '@')
 582  *
 583  * Parameters  :
 584  *          1  :  req = HTTP request line to break down
 585  *          2  :  http = pointer to the http structure to hold elements
 586  *          3  :  csp = Current client state (buffers, headers, etc...)
 587  *
 588  * Returns     :  JB_ERR_OK on success
 589  *                JB_ERR_MEMORY on out of memory
 590  *                JB_ERR_CGI_PARAMS on malformed command/URL
 591  *                                  or >100 domains deep.
 592  *
 593  *********************************************************************/
 594 jb_err parse_http_request(const char *req,
 595                           struct http_request *http,
 596                           const struct client_state *csp)
 597 {
 598    char *buf;
 599    char *v[10]; /* XXX: Why 10? We should only need three. */
 600    int n;
 601    jb_err err;
 602    int is_connect = 0;
 603
 604    memset(http, '\0', sizeof(*http));
 605
 606    buf = strdup(req);
 607    if (buf == NULL)
 608    {
 609       return JB_ERR_MEMORY;
 610    }
 611
 612    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 613    if (n != 3)
 614    {
 615       free(buf);
 616       return JB_ERR_PARSE;
 617    }
 618
 619    /*
 620     * Fail in case of unknown methods
 621     * which we might not handle correctly.
 622     *
 623     * XXX: There should be a config option
 624     * to forward requests with unknown methods
 625     * anyway. Most of them don't need special
 626     * steps.
 627     */
 628    if (unknown_method(v[0]))
 629    {
 630       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 631       free(buf);
 632       return JB_ERR_PARSE;
 633    }
 634
 635    if (strcmpic(v[0], "CONNECT") == 0)
 636    {
 637       is_connect = 1;
 638    }
 639
 640    err = parse_http_url(v[1], http, csp);
 641    if (err)
 642    {
 643       free(buf);
 644       return err;
 645    }
 646
 647    /*
 648     * Copy the details into the structure
 649     */
 650    http->ssl = is_connect;
 651    http->cmd = strdup(req);
 652    http->gpc = strdup(v[0]);
 653    http->ver = strdup(v[2]);
 654
 655    if ( (http->cmd == NULL)
 656      || (http->gpc == NULL)
 657      || (http->ver == NULL) )
 658    {
 659       free(buf);
 660       return JB_ERR_MEMORY;
 661    }
 662
 663    free(buf);
 664    return JB_ERR_OK;
 665
 666 }
 667
 668
 669 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 670 /*********************************************************************
 671  *
 672  * Function    :  compile_host_pattern
 673  *
 674  * Description :  Parses and compiles a PCRE host pattern..
 675  *
 676  * Parameters  :
 677  *          1  :  url = Target url_spec to be filled in.
 678  *          2  :  host_pattern = Host pattern to compile.
 679  *
 680  * Returns     :  JB_ERR_OK - Success
 681  *                JB_ERR_MEMORY - Out of memory
 682  *                JB_ERR_PARSE - Cannot parse regex
 683  *
 684  *********************************************************************/
 685 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 686 {
 687    int errcode;
 688    char rebuf[BUFFER_SIZE];
 689
 690    assert(host_pattern);
 691    assert(strlen(host_pattern) < sizeof(rebuf) - 2);
 692
 693    url->host_regex = zalloc(sizeof(*url->host_regex));
 694    if (NULL == url->host_regex)
 695    {
 696       free_url_spec(url);
 697       return JB_ERR_MEMORY;
 698    }
 699
 700    snprintf(rebuf, sizeof(rebuf), "%s$", host_pattern);
 701
 702    errcode = regcomp(url->host_regex, rebuf,
 703       (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 704
 705    if (errcode)
 706    {
 707       size_t errlen = regerror(errcode, url->host_regex, rebuf, sizeof(rebuf));
 708       if (errlen > (sizeof(rebuf) - (size_t)1))
 709       {
 710          errlen = sizeof(rebuf) - (size_t)1;
 711       }
 712       rebuf[errlen] = '\0';
 713       log_error(LOG_LEVEL_ERROR, "error compiling %s: %s", url->spec, rebuf);
 714       free_url_spec(url);
 715
 716       return JB_ERR_PARSE;
 717    }
 718
 719    return JB_ERR_OK;
 720
 721 }
 722
 723 #else
 724
 725 /*********************************************************************
 726  *
 727  * Function    :  compile_host_pattern
 728  *
 729  * Description :  Parses and "compiles" an old-school host pattern.
 730  *
 731  * Parameters  :
 732  *          1  :  url = Target url_spec to be filled in.
 733  *          2  :  host_pattern = Host pattern to parse.
 734  *
 735  * Returns     :  JB_ERR_OK - Success
 736  *                JB_ERR_MEMORY - Out of memory
 737  *                JB_ERR_PARSE - Cannot parse regex
 738  *
 739  *********************************************************************/
 740 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 741 {
 742    char *v[150];
 743    size_t size;
 744    char *p;
 745
 746    /*
 747     * Parse domain part
 748     */
 749    if (host_pattern[strlen(host_pattern) - 1] == '.')
 750    {
 751       url->unanchored |= ANCHOR_RIGHT;
 752    }
 753    if (host_pattern[0] == '.')
 754    {
 755       url->unanchored |= ANCHOR_LEFT;
 756    }
 757
 758    /*
 759     * Split domain into components
 760     */
 761    url->dbuffer = strdup(host_pattern);
 762    if (NULL == url->dbuffer)
 763    {
 764       free_url_spec(url);
 765       return JB_ERR_MEMORY;
 766    }
 767
 768    /*
 769     * Map to lower case
 770     */
 771    for (p = url->dbuffer; *p ; p++)
 772    {
 773       *p = (char)tolower((int)(unsigned char)*p);
 774    }
 775
 776    /*
 777     * Split the domain name into components
 778     */
 779    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 780
 781    if (url->dcount < 0)
 782    {
 783       free_url_spec(url);
 784       return JB_ERR_MEMORY;
 785    }
 786    else if (url->dcount != 0)
 787    {
 788       /*
 789        * Save a copy of the pointers in dvec
 790        */
 791       size = (size_t)url->dcount * sizeof(*url->dvec);
 792
 793       url->dvec = (char **)malloc(size);
 794       if (NULL == url->dvec)
 795       {
 796          free_url_spec(url);
 797          return JB_ERR_MEMORY;
 798       }
 799
 800       memcpy(url->dvec, v, size);
 801    }
 802    /*
 803     * else dcount == 0 in which case we needn't do anything,
 804     * since dvec will never be accessed and the pattern will
 805     * match all domains.
 806     */
 807    return JB_ERR_OK;
 808 }
 809
 810
 811 /*********************************************************************
 812  *
 813  * Function    :  simple_domaincmp
 814  *
 815  * Description :  Domain-wise Compare fqdn's.  The comparison is
 816  *                both left- and right-anchored.  The individual
 817  *                domain names are compared with simplematch().
 818  *                This is only used by domain_match.
 819  *
 820  * Parameters  :
 821  *          1  :  pv = array of patterns to compare
 822  *          2  :  fv = array of domain components to compare
 823  *          3  :  len = length of the arrays (both arrays are the
 824  *                      same length - if they weren't, it couldn't
 825  *                      possibly be a match).
 826  *
 827  * Returns     :  0 => domains are equivalent, else no match.
 828  *
 829  *********************************************************************/
 830 static int simple_domaincmp(char **pv, char **fv, int len)
 831 {
 832    int n;
 833
 834    for (n = 0; n < len; n++)
 835    {
 836       if (simplematch(pv[n], fv[n]))
 837       {
 838          return 1;
 839       }
 840    }
 841
 842    return 0;
 843
 844 }
 845
 846
 847 /*********************************************************************
 848  *
 849  * Function    :  domain_match
 850  *
 851  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 852  *                pattern->unachored, the comparison is un-, left-,
 853  *                right-anchored, or both.
 854  *                The individual domain names are compared with
 855  *                simplematch().
 856  *
 857  * Parameters  :
 858  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
 859  *          2  :  fqdn = domain name against which the patterns are compared.
 860  *
 861  * Returns     :  0 => domains are equivalent, else no match.
 862  *
 863  *********************************************************************/
 864 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
 865 {
 866    char **pv, **fv;  /* vectors  */
 867    int    plen, flen;
 868    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
 869
 870    plen = pattern->dcount;
 871    flen = fqdn->dcount;
 872
 873    if (flen < plen)
 874    {
 875       /* fqdn is too short to match this pattern */
 876       return 1;
 877    }
 878
 879    pv   = pattern->dvec;
 880    fv   = fqdn->dvec;
 881
 882    if (unanchored == ANCHOR_LEFT)
 883    {
 884       /*
 885        * Right anchored.
 886        *
 887        * Convert this into a fully anchored pattern with
 888        * the fqdn and pattern the same length
 889        */
 890       fv += (flen - plen); /* flen - plen >= 0 due to check above */
 891       return simple_domaincmp(pv, fv, plen);
 892    }
 893    else if (unanchored == 0)
 894    {
 895       /* Fully anchored, check length */
 896       if (flen != plen)
 897       {
 898          return 1;
 899       }
 900       return simple_domaincmp(pv, fv, plen);
 901    }
 902    else if (unanchored == ANCHOR_RIGHT)
 903    {
 904       /* Left anchored, ignore all extra in fqdn */
 905       return simple_domaincmp(pv, fv, plen);
 906    }
 907    else
 908    {
 909       /* Unanchored */
 910       int n;
 911       int maxn = flen - plen;
 912       for (n = 0; n <= maxn; n++)
 913       {
 914          if (!simple_domaincmp(pv, fv, plen))
 915          {
 916             return 0;
 917          }
 918          /*
 919           * Doesn't match from start of fqdn
 920           * Try skipping first part of fqdn
 921           */
 922          fv++;
 923       }
 924       return 1;
 925    }
 926
 927 }
 928 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 929
 930
 931 /*********************************************************************
 932  *
 933  * Function    :  create_url_spec
 934  *
 935  * Description :  Creates a "url_spec" structure from a string.
 936  *                When finished, free with free_url_spec().
 937  *
 938  * Parameters  :
 939  *          1  :  url = Target url_spec to be filled in.  Will be
 940  *                      zeroed before use.
 941  *          2  :  buf = Source pattern, null terminated.  NOTE: The
 942  *                      contents of this buffer are destroyed by this
 943  *                      function.  If this function succeeds, the
 944  *                      buffer is copied to url->spec.  If this
 945  *                      function fails, the contents of the buffer
 946  *                      are lost forever.
 947  *
 948  * Returns     :  JB_ERR_OK - Success
 949  *                JB_ERR_MEMORY - Out of memory
 950  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
 951  *                               written to system log)
 952  *
 953  *********************************************************************/
 954 jb_err create_url_spec(struct url_spec * url, const char * buf)
 955 {
 956    char *p;
 957    int errcode;
 958    size_t errlen;
 959    char rebuf[BUFFER_SIZE];
 960
 961    assert(url);
 962    assert(buf);
 963
 964    /*
 965     * Zero memory
 966     */
 967    memset(url, '\0', sizeof(*url));
 968
 969    /*
 970     * Save a copy of the orignal specification
 971     */
 972    if ((url->spec = strdup(buf)) == NULL)
 973    {
 974       return JB_ERR_MEMORY;
 975    }
 976
 977    /* Is it tag pattern? */
 978    if (0 == strncmpic("TAG:", url->spec, 4))
 979    {
 980       if (NULL == (url->tag_regex = zalloc(sizeof(*url->tag_regex))))
 981       {
 982          freez(url->spec);
 983          return JB_ERR_MEMORY;
 984       }
 985
 986       /* buf + 4 to skip "TAG:" */
 987       errcode = regcomp(url->tag_regex, buf + 4, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 988       if (errcode)
 989       {
 990          errlen = regerror(errcode, url->preg, rebuf, sizeof(rebuf));
 991          if (errlen > (sizeof(rebuf) - 1))
 992          {
 993             errlen = sizeof(rebuf) - 1;
 994          }
 995          rebuf[errlen] = '\0';
 996          log_error(LOG_LEVEL_ERROR, "error compiling %s: %s", url->spec, rebuf);
 997          free_url_spec(url);
 998
 999          return JB_ERR_PARSE;
1000       }
1001       return JB_ERR_OK;
1002    }
1003
1004    /* Only reached for URL patterns */
1005    p = strchr(buf, '/');
1006    if (NULL != p)
1007    {
1008       /*
1009        * Only compile the regex if it consists of more than
1010        * a single slash, otherwise it wouldn't affect the result.
1011        */
1012       if (*(p+1) != '\0')
1013       {
1014          /* XXX: mostly duplicated code, should be factored out. */
1015          url->preg = zalloc(sizeof(*url->preg));
1016          if (NULL == url->preg)
1017          {
1018             free_url_spec(url);
1019             return JB_ERR_MEMORY;
1020          }
1021
1022          snprintf(rebuf, sizeof(rebuf), "^(%s)", p);
1023          errcode = regcomp(url->preg, rebuf,
1024             (REG_EXTENDED|REG_NOSUB|REG_ICASE));
1025          if (errcode)
1026          {
1027             errlen = regerror(errcode, url->preg, rebuf, sizeof(rebuf));
1028
1029             if (errlen > (sizeof(rebuf) - (size_t)1))
1030             {
1031                errlen = sizeof(rebuf) - (size_t)1;
1032             }
1033             rebuf[errlen] = '\0';
1034             log_error(LOG_LEVEL_ERROR, "error compiling %s: %s",
1035                url->spec, rebuf);
1036             free_url_spec(url);
1037
1038             return JB_ERR_PARSE;
1039          }
1040       }
1041       *p = '\0';
1042    }
1043
1044    p = strchr(buf, ':');
1045    if (NULL != p)
1046    {
1047       *p++ = '\0';
1048       url->port_list = strdup(p);
1049       if (NULL == url->port_list)
1050       {
1051          return JB_ERR_MEMORY;
1052       }
1053    }
1054    else
1055    {
1056       url->port_list = NULL;
1057    }
1058
1059    if (buf[0] != '\0')
1060    {
1061       return compile_host_pattern(url, buf);
1062    }
1063
1064    return JB_ERR_OK;
1065
1066 }
1067
1068
1069 /*********************************************************************
1070  *
1071  * Function    :  free_url_spec
1072  *
1073  * Description :  Called from the "unloaders".  Freez the url
1074  *                structure elements.
1075  *
1076  * Parameters  :
1077  *          1  :  url = pointer to a url_spec structure.
1078  *
1079  * Returns     :  N/A
1080  *
1081  *********************************************************************/
1082 void free_url_spec(struct url_spec *url)
1083 {
1084    if (url == NULL) return;
1085
1086    freez(url->spec);
1087 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1088    if (url->host_regex)
1089    {
1090       regfree(url->host_regex);
1091       freez(url->host_regex);
1092    }
1093 #else
1094    freez(url->dbuffer);
1095    freez(url->dvec);
1096    url->dcount = 0;
1097 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1098    freez(url->port_list);
1099    if (url->preg)
1100    {
1101       regfree(url->preg);
1102       freez(url->preg);
1103    }
1104    if (url->tag_regex)
1105    {
1106       regfree(url->tag_regex);
1107       freez(url->tag_regex);
1108    }
1109 }
1110
1111
1112 /*********************************************************************
1113  *
1114  * Function    :  url_match
1115  *
1116  * Description :  Compare a URL against a URL pattern.
1117  *
1118  * Parameters  :
1119  *          1  :  pattern = a URL pattern
1120  *          2  :  url = URL to match
1121  *
1122  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1123  *
1124  *********************************************************************/
1125 int url_match(const struct url_spec *pattern,
1126               const struct http_request *http)
1127 {
1128    /* XXX: these should probably be functions. */
1129 #define PORT_MATCHES ((NULL == pattern->port_list) || match_portlist(pattern->port_list, http->port))
1130 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1131 #define DOMAIN_MATCHES ((NULL == pattern->host_regex) || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)))
1132 #else
1133 #define DOMAIN_MATCHES ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)))
1134 #endif
1135 #define PATH_MATCHES ((NULL == pattern->preg) || (0 == regexec(pattern->preg, http->path, 0, NULL, 0)))
1136
1137    if (pattern->tag_regex != NULL)
1138    {
1139       /* It's a tag pattern and shouldn't be matched against URLs */
1140       return 0;
1141    }
1142
1143    return (PORT_MATCHES && DOMAIN_MATCHES && PATH_MATCHES);
1144
1145 }
1146
1147
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is parsed in place; no memory is
 *                allocated and the list is not modified.
 *
 *                Numbers are parsed like atoi() would: leading
 *                whitespace is skipped, trailing garbage is ignored
 *                and text without digits counts as 0.  In a range
 *                an omitted lower bound therefore means 0, while an
 *                omitted (or zero) upper bound means 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list.  NULL is treated as
 *                           a list that covers nothing.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      const char *comma = strchr(item, (int) ',');
      size_t item_len = (NULL != comma) ? (size_t)(comma - item) : strlen(item);
      const char *dash = memchr(item, (int) '-', item_len);
      char *parse_end;
      long low = strtol(item, &parse_end, 10);

      if (NULL == dash)
      {
         /*
          * No dash, check for equality.  Parsing can't run into
          * the next item as it stops at the comma.
          */
         if ((long)port == low)
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         long high = strtol(dash + 1, NULL, 10);

         if (parse_end > dash)
         {
            /*
             * Nothing but whitespace in front of the dash, so
             * strtol() took the dash for a minus sign.  The lower
             * bound is actually empty and thus 0.
             */
            low = 0;
         }
         if (0 == high)
         {
            high = 65535;
         }

         if (((long)port >= low) && ((long)port <= high))
         {
            return 1;
         }
      }

      /*
       * Jump to next item
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1226
1227
1228 /*
1229   Local Variables:
1230   tab-width: 3
1231   end:
1232 */