urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.38 2008/04/18 05:17:18 fabiankeil Exp $"; /* CVS id string of this file */
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2003, 2006-2008 the SourceForge
  10  *                Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  * Revisions   :
  35  *    $Log: urlmatch.c,v $
  36  *    Revision 1.38  2008/04/18 05:17:18  fabiankeil
  37  *    Mark simplematch()'s parameters as immutable.
  38  *
  39  *    Revision 1.37  2008/04/17 14:53:29  fabiankeil
  40  *    Move simplematch() into urlmatch.c as it's only
  41  *    used to match (old-school) domain patterns.
  42  *
  43  *    Revision 1.36  2008/04/14 18:19:48  fabiankeil
  44  *    Remove now-pointless cast in create_url_spec().
  45  *
  46  *    Revision 1.35  2008/04/14 18:11:21  fabiankeil
  47  *    The compiler might not notice it, but the buffer passed to
  48  *    create_url_spec() is modified later on and thus shouldn't
  49  *    be declared immutable.
  50  *
  51  *    Revision 1.34  2008/04/13 13:32:07  fabiankeil
  52  *    Factor URL pattern compilation out of create_url_spec().
  53  *
  54  *    Revision 1.33  2008/04/12 14:03:13  fabiankeil
  55  *    Remove an obvious comment and improve another one.
  56  *
  57  *    Revision 1.32  2008/04/12 12:38:06  fabiankeil
  58  *    Factor out duplicated code to compile host, path and tag patterns.
  59  *
  60  *    Revision 1.31  2008/04/10 14:41:04  fabiankeil
  61  *    Ditch url_spec's path member now that it's no longer used.
  62  *
  63  *    Revision 1.30  2008/04/10 04:24:24  fabiankeil
  64  *    Stop duplicating the plain text representation of the path regex
  65  *    (and keeping the copy around). Once the regex is compiled it's no
  66  *    longer useful.
  67  *
  68  *    Revision 1.29  2008/04/10 04:17:56  fabiankeil
  69  *    In url_match(), check the right member for NULL when determining
  70  *    whether there's a path regex to execute. Looking for a plain-text
  71  *    representation works as well, but it looks "interesting" and that
  72  *    member will be removed soonish anyway.
  73  *
  74  *    Revision 1.28  2008/04/08 16:07:39  fabiankeil
  75  *    Make it harder to mistake url_match()'s
  76  *    second parameter for an url_spec.
  77  *
  78  *    Revision 1.27  2008/04/08 15:44:33  fabiankeil
  79  *    Save a bit of memory (and a few cpu cycles) by not bothering to
  80  *    compile slash-only path regexes that don't affect the result.
  81  *
  82  *    Revision 1.26  2008/04/07 16:57:18  fabiankeil
  83  *    - Use free_url_spec() more consistently.
  84  *    - Let it reset url->dcount just in case.
  85  *
  86  *    Revision 1.25  2008/04/06 15:18:38  fabiankeil
  87  *    Oh well, rename the --enable-pcre-host-patterns option to
  88  *    --enable-extended-host-patterns as it's not really PCRE syntax.
  89  *
  90  *    Revision 1.24  2008/04/06 14:54:26  fabiankeil
  91  *    Use PCRE syntax in host patterns when configured
  92  *    with --enable-pcre-host-patterns.
  93  *
  94  *    Revision 1.23  2008/04/05 12:19:20  fabiankeil
  95  *    Factor compile_host_pattern() out of create_url_spec().
  96  *
  97  *    Revision 1.22  2008/03/30 15:02:32  fabiankeil
  98  *    SZitify unknown_method().
  99  *
 100  *    Revision 1.21  2007/12/24 16:34:23  fabiankeil
 101  *    Band-aid (and micro-optimization) that makes it less likely to run out of
 102  *    stack space with overly-complex path patterns. Probably masks the problem
 103  *    reported by Lee in #1856679. Hohoho.
 104  *
 105  *    Revision 1.20  2007/09/02 15:31:20  fabiankeil
 106  *    Move match_portlist() from filter.c to urlmatch.c.
 107  *    It's used for url matching, not for filtering.
 108  *
 109  *    Revision 1.19  2007/09/02 13:42:11  fabiankeil
 110  *    - Allow port lists in url patterns.
 111  *    - Ditch unused url_spec member pathlen.
 112  *
 113  *    Revision 1.18  2007/07/30 16:42:21  fabiankeil
 114  *    Move the method check into unknown_method()
 115  *    and loop through the known methods instead
 116  *    of using a screen-long OR chain.
 117  *
 118  *    Revision 1.17  2007/04/15 16:39:21  fabiankeil
 119  *    Introduce tags as alternative way to specify which
 120  *    actions apply to a request. At the moment tags can be
 121  *    created based on client and server headers.
 122  *
 123  *    Revision 1.16  2007/02/13 13:59:24  fabiankeil
 124  *    Remove redundant log message.
 125  *
 126  *    Revision 1.15  2007/01/28 16:11:23  fabiankeil
 127  *    Accept WebDAV methods for subversion
 128  *    in parse_http_request(). Closes FR 1581425.
 129  *
 130  *    Revision 1.14  2007/01/06 14:23:56  fabiankeil
 131  *    Fix gcc43 warnings. Mark *csp as immutable
 132  *    for parse_http_url() and url_match().
 133  *    Replace a sprintf call with snprintf.
 134  *
 135  *    Revision 1.13  2006/12/06 19:50:54  fabiankeil
 136  *    parse_http_url() now handles intercepted
 137  *    HTTP request lines as well. Moved parts
 138  *    of parse_http_url()'s code into
 139  *    init_domain_components() so that it can
 140  *    be reused in chat().
 141  *
 142  *    Revision 1.12  2006/07/18 14:48:47  david__schmidt
 143  *    Reorganizing the repository: swapping out what was HEAD (the old 3.1 branch)
 144  *    with what was really the latest development (the v_3_0_branch branch)
 145  *
 146  *    Revision 1.10.2.7  2003/05/17 15:57:24  oes
 147  *     - parse_http_url now checks memory allocation failure for
 148  *       duplication of "*" URL and rejects "*something" URLs
 149  *       Closes bug #736344
 150  *     - Added a comment to what might look like a bug in
 151  *       create_url_spec (see !bug #736931)
 152  *     - Comment cosmetics
 153  *
 154  *    Revision 1.10.2.6  2003/05/07 12:39:48  oes
 155  *    Fix typo: Default port for https URLs is 443, not 143.
 156  *    Thanks to Scott Tregear for spotting this one.
 157  *
 158  *    Revision 1.10.2.5  2003/02/28 13:09:29  oes
 159  *    Fixed a rare double free condition as per Bug #694713
 160  *
 161  *    Revision 1.10.2.4  2003/02/28 12:57:44  oes
 162  *    Moved freeing of http request structure to its owner
 163  *    as per Dan Price's observations in Bug #694713
 164  *
 165  *    Revision 1.10.2.3  2002/11/12 16:50:40  oes
 166  *    Fixed memory leak in parse_http_request() reported by Oliver Stoeneberg. Fixes bug #637073
 167  *
 168  *    Revision 1.10.2.2  2002/09/25 14:53:15  oes
 169  *    Added basic support for OPTIONS and TRACE HTTP methods:
 170  *    parse_http_url now recognizes the "*" URI as well as
 171  *    the OPTIONS and TRACE method keywords.
 172  *
 173  *    Revision 1.10.2.1  2002/06/06 19:06:44  jongfoster
 174  *    Adding support for proprietary Microsoft WebDAV extensions
 175  *
 176  *    Revision 1.10  2002/05/12 21:40:37  jongfoster
 177  *    - Removing some unused code
 178  *
 179  *    Revision 1.9  2002/04/04 00:36:36  gliptak
 180  *    always use pcre for matching
 181  *
 182  *    Revision 1.8  2002/04/03 23:32:47  jongfoster
 183  *    Fixing memory leak on error
 184  *
 185  *    Revision 1.7  2002/03/26 22:29:55  swa
 186  *    we have a new homepage!
 187  *
 188  *    Revision 1.6  2002/03/24 13:25:43  swa
 189  *    name change related issues
 190  *
 191  *    Revision 1.5  2002/03/13 00:27:05  jongfoster
 192  *    Killing warnings
 193  *
 194  *    Revision 1.4  2002/03/07 03:46:17  oes
 195  *    Fixed compiler warnings
 196  *
 197  *    Revision 1.3  2002/03/03 14:51:11  oes
 198  *    Fixed CLF logging: Added ocmd member for client's request to struct http_request
 199  *
 200  *    Revision 1.2  2002/01/21 00:14:09  jongfoster
 201  *    Correcting comment style
 202  *    Fixing an uninitialized memory bug in create_url_spec()
 203  *
 204  *    Revision 1.1  2002/01/17 20:53:46  jongfoster
 205  *    Moving all our URL and URL pattern parsing code to the same file - it
 206  *    was scattered around in filters.c, loaders.c and parsers.c.
 207  *
 208  *    Providing a single, simple url_match(pattern,url) function - rather than
 209  *    the 3-line match routine which was repeated all over the place.
 210  *
 211  *    Renaming free_url to free_url_spec, since it frees a struct url_spec.
 212  *
 213  *    Providing parse_http_url() so that URLs can be parsed without faking a
 214  *    HTTP request line for parse_http_request() or repeating the parsing
 215  *    code (both of which were techniques that were actually in use).
 216  *
 217  *    Standardizing that struct http_request is used to represent a URL, and
 218  *    struct url_spec is used to represent a URL pattern.  (Before, URLs were
 219  *    represented as separate variables and a partially-filled-in url_spec).
 220  *
 221  *
 222  *********************************************************************/
 223 \f
 224
 225 #include "config.h"
 226
 227 #ifndef _WIN32
 228 #include <stdio.h>
 229 #include <sys/types.h>
 230 #endif
 231
 232 #include <stdlib.h>
 233 #include <ctype.h>
 234 #include <assert.h>
 235 #include <string.h>
 236
 237 #if !defined(_WIN32) && !defined(__OS2__)
 238 #include <unistd.h>
 239 #endif
 240
 241 #include "project.h"
 242 #include "urlmatch.h"
 243 #include "ssplit.h"
 244 #include "miscutil.h"
 245 #include "errlog.h"
 246
 247 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION; /* version string taken from the URLMATCH_H_VERSION macro */
 248
 249 enum regex_anchoring {NO_ANCHORING, LEFT_ANCHORED, RIGHT_ANCHORED}; /* anchoring modes handled by compile_pattern() */
 250 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern); /* forward declaration: compile_url_pattern() calls it before it is defined */
 251
 252 /*********************************************************************
 253  *
 254  * Function    :  free_http_request
 255  *
 256  * Description :  Runs freez() on the strings and buffers of a http_request.
 257  *
 258  * Parameters  :
 259  *          1  :  http = points to the http_request whose members go away
 260  *
 261  * Returns     :  N/A
 262  *
 263  *********************************************************************/
 264 void free_http_request(struct http_request *http)
 265 {
 266    assert(http);
 267
 268    http->dcount = 0;
 269    freez(http->dvec);
 270    freez(http->dbuffer);
 271    freez(http->host_ip_addr_str);
 272    freez(http->ver);
 273    freez(http->path);
 274    freez(http->hostport);
 275    freez(http->url);
 276    freez(http->host);
 277    freez(http->gpc);
 278    freez(http->ocmd);
 279    freez(http->cmd);
 280 }
 281
 282
 283 /*********************************************************************
 284  *
 285  * Function    :  init_domain_components
 286  *
 287  * Description :  Copies http->host to http->dbuffer in lower case,
 288  *                splits the copy at the dots and stores pointers to
 289  *                the components in http->dvec for wildcard matching.
 290  *                Kept apart from parse_http_url because chat needs
 291  *                the same code for intercepted requests.
 292  *
 293  * Parameters  :
 294  *          1  :  http = pointer to the http structure to hold elements.
 295  *
 296  * Returns     :  JB_ERR_OK on success
 297  *                JB_ERR_MEMORY on out of memory
 298  *                JB_ERR_PARSE if the host has no components or
 299  *                             more than the local vector can hold.
 300  *
 301  *********************************************************************/
 302 jb_err init_domain_components(struct http_request *http)
 303 {
 304    char *components[BUFFER_SIZE];
 305    size_t bytes_needed;
 306    char *cursor;
 307
 308    /* The host name is split in a private copy that is kept in dbuffer. */
 309    http->dbuffer = strdup(http->host);
 310    if (http->dbuffer == NULL)
 311    {
 312       return JB_ERR_MEMORY;
 313    }
 314
 315    /* Fold the copy to lower case, one character at a time. */
 316    cursor = http->dbuffer;
 317    while (*cursor != '\0')
 318    {
 319       *cursor = (char)tolower((int)(unsigned char)*cursor);
 320       cursor++;
 321    }
 322
 323    /* Cut the copy into its dot-separated components. */
 324    http->dcount = ssplit(http->dbuffer, ".", components, SZ(components), 1, 1);
 325    if (http->dcount <= 0)
 326    {
 327       /*
 328        * Either the domain has no components at all or
 329        * it has more of them than components[] can take.
 330        */
 331       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 332       return JB_ERR_PARSE;
 333    }
 334
 335    /* Hand the pointers over to a heap array of exactly the right size. */
 336    bytes_needed = sizeof(*http->dvec) * (size_t)http->dcount;
 337    http->dvec = (char **)malloc(bytes_needed);
 338    if (http->dvec == NULL)
 339    {
 340       return JB_ERR_MEMORY;
 341    }
 342    memcpy(http->dvec, components, bytes_needed);
 343
 344    return JB_ERR_OK;
 345 }
 346
 347
 348 /*********************************************************************
 349  *
 350  * Function    :  parse_http_url
 351  *
 352  * Description :  Splits an URL into hostport, host, port and path and
 353  *                stores copies in http. An user name or password in
 354  *                front of an '@' is skipped. On failure, members that
 355  *                were already filled in are not released here, they
 356  *                stay in http.
 357  *
 358  * Parameters  :
 359  *          1  :  url = URL (or is it URI?) to break down
 360  *          2  :  http = pointer to the http structure to hold elements.
 361  *                       Will be zeroed before use.  Note that this
 362  *                       function sets the http->gpc and http->ver
 363  *                       members to NULL.
 364  *          3  :  csp = Current client state (not used in this function)
 365  *
 366  * Returns     :  JB_ERR_OK on success
 367  *                JB_ERR_MEMORY on out of memory
 368  *                JB_ERR_PARSE on malformed URL, on a port that isn't
 369  *                             a number between 1 and 65535, or if
 370  *                             init_domain_components() can't split the host.
 371  *
 372  *********************************************************************/
 373 jb_err parse_http_url(const char * url,
 374                       struct http_request *http,
 375                       const struct client_state *csp)
 376 {
 377    int host_available = 1; /* A proxy can dream. */
 378
 379    /* Zero out the results structure */
 380    memset(http, '\0', sizeof(*http));
 381
 382    /* Save our initial URL */
 383    http->url = strdup(url);
 384    if (http->url == NULL)
 385    {
 386       return JB_ERR_MEMORY;
 387    }
 388
 389    /* Check for * URI. If found, we're done. */
 390    if (*http->url == '*')
 391    {
 392       if  ( NULL == (http->path = strdup("*"))
 393          || NULL == (http->hostport = strdup("")) )
 394       {
 395          return JB_ERR_MEMORY;
 396       }
 397       if (http->url[1] != '\0')
 398       {
 399          return JB_ERR_PARSE;
 400       }
 401       return JB_ERR_OK;
 402    }
 403
 404    /* Split URL into protocol,hostport,path. */
 405    {
 406       char *buf;
 407       char *url_noproto;
 408       char *url_path;
 409
 410       buf = strdup(url);
 411       if (buf == NULL)
 412       {
 413          return JB_ERR_MEMORY;
 414       }
 415
 416       /* Find the start of the URL in our scratch space */
 417       url_noproto = buf;
 418       if (strncmpic(url_noproto, "http://",  7) == 0)
 419       {
 420          url_noproto += 7;
 421          http->ssl = 0;
 422       }
 423       else if (strncmpic(url_noproto, "https://", 8) == 0)
 424       {
 425          url_noproto += 8;
 426          http->ssl = 1;
 427       }
 428       else if (*url_noproto == '/')
 429       {
 430         /*
 431          * Short request line without protocol and host.
 432          * Most likely because the client's request
 433          * was intercepted and redirected into Privoxy.
 434          */
 435          http->ssl = 0;
 436          http->host = NULL;
 437          host_available = 0;
 438       }
 439       else
 440       {
 441          http->ssl = 0;
 442       }
 443
 444       url_path = strchr(url_noproto, '/');
 445       if (url_path != NULL)
 446       {
 447          /*
 448           * Got a path.
 449           *
 450           * NOTE: The following line ignores the path for HTTPS URLS.
 451           * This means that you get consistent behaviour if you type a
 452           * https URL in and it's parsed by the function.  (When the
 453           * URL is actually retrieved, SSL hides the path part).
 454           */
 455          http->path = strdup(http->ssl ? "/" : url_path);
 456          *url_path = '\0';
 457          http->hostport = strdup(url_noproto);
 458       }
 459       else
 460       {
 461          /*
 462           * Repair broken HTTP requests that don't contain a path,
 463           * or CONNECT requests
 464           */
 465          http->path = strdup("/");
 466          http->hostport = strdup(url_noproto);
 467       }
 468
 469       freez(buf);
 470
 471       if ( (http->path == NULL)
 472         || (http->hostport == NULL))
 473       {
 474          return JB_ERR_MEMORY;
 475       }
 476    }
 477
 478    if (!host_available)
 479    {
 480       /* Without host, there is nothing left to do here */
 481       return JB_ERR_OK;
 482    }
 483
 484    /* Split hostport into user/password (ignored), host, port. */
 485    {
 486       char *buf;
 487       char *host;
 488       char *port;
 489
 490       buf = strdup(http->hostport);
 491       if (buf == NULL)
 492       {
 493          return JB_ERR_MEMORY;
 494       }
 495
 496       /* check if url contains username and/or password */
 497       host = strchr(buf, '@');
 498       if (host != NULL)
 499       {
 500          /* Contains username/password, skip it and the @ sign. */
 501          host++;
 502       }
 503       else
 504       {
 505          /* No username or password. */
 506          host = buf;
 507       }
 508
 509       /* check if url contains port */
 510       port = strchr(host, ':');
 511       if (port != NULL)
 512       {
 513          char *endptr;
 514          long parsed_port;
 515
 516          /* Terminate hostname and point to start of port string */
 517          *port++ = '\0';
 518
 519          /* Parse strictly: the whole string has to be a valid TCP port. */
 520          parsed_port = strtol(port, &endptr, 10);
 521          if ((endptr == port) || (*endptr != '\0')
 522             || (parsed_port <= 0) || (parsed_port > 65535))
 523          {
 524             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", http->url);
 525             free(buf);
 526             return JB_ERR_PARSE;
 527          }
 528          http->port = (int)parsed_port;
 529       }
 530       else
 531       {
 532          /* No port specified. */
 533          http->port = (http->ssl ? 443 : 80);
 534       }
 535
 536       http->host = strdup(host);
 537
 538       free(buf);
 539
 540       if (http->host == NULL)
 541       {
 542          return JB_ERR_MEMORY;
 543       }
 544    }
 545
 546    /* Split domain name so we can compare it against wildcards */
 547    return init_domain_components(http);
 548 }
 549
 550
 551 /*********************************************************************
 552  *
 553  * Function    :  unknown_method
 554  *
 555  * Description :  Looks a method up in the table of methods we know.
 556  *
 557  * Parameters  :
 558  *          1  :  method = points to a http method
 559  *
 560  * Returns     :  FALSE if strcmpic() finds it in the table, else TRUE.
 561  *
 562  *********************************************************************/
 563 static int unknown_method(const char *method)
 564 {
 565    static const char *known_http_methods[] = {
 566       /* Basic HTTP request type */
 567       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 568       /* webDAV extensions (RFC2518) */
 569       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 570       /*
 571        * Microsoft webDAV extension for Exchange 2000.  See:
 572        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 573        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 574        */
 575       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 576       /*
 577        * Another Microsoft webDAV extension for Exchange 2000.  See:
 578        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 579        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 580        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 581        */
 582       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 583       /*
 584        * Yet another WebDAV extension, this time for
 585        * Web Distributed Authoring and Versioning (RFC3253)
 586        */
 587       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 588       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 589    };
 590    int is_unknown = TRUE;
 591    int idx = 0;
 592    /* Walk the table until the method turns up or the table ends. */
 593    while (is_unknown && (idx < SZ(known_http_methods)))
 594    {
 595       if (strcmpic(method, known_http_methods[idx]) == 0)
 596       {
 597          is_unknown = FALSE;
 598       }
 599       idx++;
 600    }
 601    return is_unknown;
 602 }
 603
 604
 605 /*********************************************************************
 606  *
 607  * Function    :  parse_http_request
 608  *
 609  * Description :  Splits a request line into method, URL and version,
 610  *                rejects unknown methods and lets parse_http_url()
 611  *                break down the URL.
 612  *
 613  * Parameters  :
 614  *          1  :  req = HTTP request line to break down
 615  *          2  :  http = pointer to the http structure to hold elements
 616  *          3  :  csp = Current client state (buffers, headers, etc...)
 617  *
 618  * Returns     :  JB_ERR_OK on success
 619  *                JB_ERR_MEMORY on out of memory
 620  *                JB_ERR_PARSE on malformed or unsupported requests
 621  *
 622  *********************************************************************/
 623 jb_err parse_http_request(const char *req,
 624                           struct http_request *http,
 625                           const struct client_state *csp)
 626 {
 627    char *scratch;
 628    char *token[10]; /* XXX: Why 10? We should only need three. */
 629    jb_err err;
 630
 631    memset(http, '\0', sizeof(*http));
 632
 633    /* ssplit() gets a copy to work on, req itself is left alone. */
 634    scratch = strdup(req);
 635    if (NULL == scratch)
 636    {
 637       return JB_ERR_MEMORY;
 638    }
 639
 640    if (3 != ssplit(scratch, " \r\n", token, SZ(token), 1, 1))
 641    {
 642       /* Anything but "method URL version" is malformed. */
 643       err = JB_ERR_PARSE;
 644    }
 645    else if (unknown_method(token[0]))
 646    {
 647       /*
 648        * Fail in case of unknown methods
 649        * which we might not handle correctly.
 650        *
 651        * XXX: There should be a config option
 652        * to forward requests with unknown methods
 653        * anyway. Most of them don't need special
 654        * steps.
 655        */
 656       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", token[0]);
 657       err = JB_ERR_PARSE;
 658    }
 659    else
 660    {
 661       err = parse_http_url(token[1], http, csp);
 662       if (!err)
 663       {
 664          /* Copy the remaining details into the structure. */
 665          http->ssl = !strcmpic(token[0], "CONNECT");
 666          http->cmd = strdup(req);
 667          http->gpc = strdup(token[0]);
 668          http->ver = strdup(token[2]);
 669
 670          if ( (NULL == http->cmd)
 671            || (NULL == http->gpc)
 672            || (NULL == http->ver) )
 673          {
 674             err = JB_ERR_MEMORY;
 675          }
 676          else
 677          {
 678             err = JB_ERR_OK;
 679          }
 680       }
 681    }
 682
 683    /*
 684     * Single exit: the tokens are presumably pointers into
 685     * scratch, so it is only released once they were copied.
 686     */
 687    free(scratch);
 688    return err;
 689 }
 690
 691
 692 /*********************************************************************
 693  *
 694  * Function    :  compile_pattern
 695  *
 696  * Description :  Compiles a host, path or TAG pattern.
 697  *
 698  * Parameters  :
 699  *          1  :  pattern = The pattern to compile.
 700  *          2  :  anchoring = How the regex should be anchored: NO_ANCHORING,
 701  *                            LEFT_ANCHORED or RIGHT_ANCHORED.
 702  *          3  :  url     = Logged and freed with free_url_spec() on failure.
 703  *          4  :  regex   = Where the compiled regex should be stored.
 704  *                          Set to NULL if the pattern is empty.
 705  *
 706  * Returns     :  JB_ERR_OK - Success
 707  *                JB_ERR_MEMORY - Out of memory
 708  *                JB_ERR_PARSE - Pattern too long or not a valid regex
 709  *
 710  *********************************************************************/
 711 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 712                               struct url_spec *url, regex_t **regex)
 713 {
 714    int errcode;
 715    char rebuf[BUFFER_SIZE];
 716    const char *fmt;
 717
 718    assert(pattern);
 719    if (pattern[0] == '\0')
 720    {
 721       *regex = NULL;
 722       return JB_ERR_OK;
 723    }
 724    /* A run-time check, as assert() is compiled out with NDEBUG. */
 725    if (strlen(pattern) >= sizeof(rebuf) - 2)
 726    {
 727       log_error(LOG_LEVEL_ERROR, "pattern too long in %s: %s", url->spec, pattern);
 728       free_url_spec(url);
 729       return JB_ERR_PARSE;
 730    }
 731
 732    switch (anchoring)
 733    {
 734       case NO_ANCHORING:
 735          fmt = "%s";
 736          break;
 737       case RIGHT_ANCHORED:
 738          fmt = "%s$";
 739          break;
 740       case LEFT_ANCHORED:
 741          fmt = "^%s";
 742          break;
 743       default:
 744          log_error(LOG_LEVEL_FATAL, "Invalid anchoring in compile_pattern %d", anchoring);
 745          /* Don't continue with fmt unset should log_error() return. */
 746          free_url_spec(url);
 747          return JB_ERR_PARSE;
 748    }
 749
 750    *regex = zalloc(sizeof(**regex));
 751    if (NULL == *regex)
 752    {
 753       free_url_spec(url);
 754       return JB_ERR_MEMORY;
 755    }
 756
 757    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 758    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 759    if (errcode)
 760    {
 761       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 762       if (errlen > (sizeof(rebuf) - (size_t)1))
 763       {
 764          errlen = sizeof(rebuf) - (size_t)1;
 765       }
 766       rebuf[errlen] = '\0';
 767       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s", pattern, url->spec, rebuf);
 768       free_url_spec(url);
 769       return JB_ERR_PARSE;
 770    }
 771    return JB_ERR_OK;
 772 }
 773
 774
 775 /*********************************************************************
 776  *
 777  * Function    :  compile_url_pattern
 778  *
 779  * Description :  Compiles the three parts of an URL pattern.
 780  *
 781  * Parameters  :
 782  *          1  :  url = Target url_spec to be filled in, freed on failure.
 783  *          2  :  buf = The url pattern to compile. Will be messed up.
 784  *
 785  * Returns     :  JB_ERR_OK - Success
 786  *                JB_ERR_MEMORY - Out of memory
 787  *                JB_ERR_PARSE - Cannot parse regex
 788  *
 789  *********************************************************************/
 790 static jb_err compile_url_pattern(struct url_spec *url, char *buf)
 791 {
 792    char *p;
 793
 794    p = strchr(buf, '/');
 795    if (NULL != p)
 796    {
 797       /*
 798        * Only compile the regex if it consists of more than
 799        * a single slash, otherwise it wouldn't affect the result.
 800        */
 801       if (p[1] != '\0')
 802       {
 803          /*
 804           * XXX: does it make sense to compile the slash at the beginning?
 805           */
 806          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->preg);
 807          /* compile_pattern() already freed url if it failed. */
 808          if (JB_ERR_OK != err)
 809          {
 810             return err;
 811          }
 812       }
 813       *p = '\0';
 814    }
 815
 816    p = strchr(buf, ':');
 817    if (NULL != p)
 818    {
 819       *p++ = '\0';
 820       url->port_list = strdup(p);
 821       if (NULL == url->port_list)
 822       {
 823          /* Free url here as well, like the other failure paths do. */
 824          free_url_spec(url);
 825          return JB_ERR_MEMORY;
 826       }
 827    }
 828    else
 829    {
 830       url->port_list = NULL;
 831    }
 832
 833    if (buf[0] != '\0')
 834    {
 835       return compile_host_pattern(url, buf);
 836    }
 837    return JB_ERR_OK;
 838 }
 839
 840
 841 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 842 /*********************************************************************
 843  *
 844  * Function    :  compile_host_pattern
 845  *
 846  * Description :  Compiles a host pattern as right-anchored regex.
 847  *
 848  * Parameters  :
 849  *          1  :  url = Target url_spec to be filled in.
 850  *          2  :  host_pattern = Host pattern to compile.
 851  *
 852  * Returns     :  The result of compile_pattern(): JB_ERR_OK,
 853  *                JB_ERR_MEMORY or JB_ERR_PARSE.
 854  *
 855  *********************************************************************/
 856 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 857 {
 858    regex_t **host_regex = &url->host_regex;
 859    return compile_pattern(host_pattern, RIGHT_ANCHORED, url, host_regex);
 860 }
 861
 862 #else
 863
 864 /*********************************************************************
 865  *
 866  * Function    :  compile_host_pattern
 867  *
 868  * Description :  Parses and "compiles" an old-school host pattern.
 869  *
 870  * Parameters  :
 871  *          1  :  url = Target url_spec to be filled in. Freed on failure.
 872  *          2  :  host_pattern = Host pattern to parse.
 873  *
 874  * Returns     :  JB_ERR_OK - Success
 875  *                JB_ERR_MEMORY - Out of memory
 876  *                JB_ERR_PARSE - Host pattern can't be split
 877  *
 878  *********************************************************************/
 879 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 880 {
 881    char *v[150];
 882    size_t size;
 883    size_t pattern_length = strlen(host_pattern);
 884    char *p;
 885
 886    /*
 887     * A dot at either end marks that end of the pattern as
 888     * unanchored. The length check keeps an empty pattern
 889     * from reading in front of the buffer.
 890     */
 891    if ((pattern_length > 0) && (host_pattern[pattern_length - 1] == '.'))
 892    {
 893       url->unanchored |= ANCHOR_RIGHT;
 894    }
 895    if (host_pattern[0] == '.')
 896    {
 897       url->unanchored |= ANCHOR_LEFT;
 898    }
 899
 900    /* Keep a private copy of the pattern to split up */
 901    url->dbuffer = strdup(host_pattern);
 902    if (NULL == url->dbuffer)
 903    {
 904       free_url_spec(url);
 905       return JB_ERR_MEMORY;
 906    }
 907
 908    /* Map to lower case */
 909    for (p = url->dbuffer; *p ; p++)
 910    {
 911       *p = (char)tolower((int)(unsigned char)*p);
 912    }
 913
 914    /* Split the domain name into components */
 915    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 916
 917    if (url->dcount < 0)
 918    {
 919       /*
 920        * ssplit() reported an error (presumably more components
 921        * than v[] can hold). That's a problem with the pattern,
 922        * not an allocation failure.
 923        */
 924       free_url_spec(url);
 925       return JB_ERR_PARSE;
 926    }
 927    else if (url->dcount != 0)
 928    {
 929       /* Save a copy of the pointers in dvec */
 930       size = (size_t)url->dcount * sizeof(*url->dvec);
 931
 932       url->dvec = (char **)malloc(size);
 933       if (NULL == url->dvec)
 934       {
 935          free_url_spec(url);
 936          return JB_ERR_MEMORY;
 937       }
 938
 939       memcpy(url->dvec, v, size);
 940    }
 941    /*
 942     * else dcount == 0 in which case we needn't do anything,
 943     * since dvec will never be accessed and the pattern will
 944     * match all domains.
 945     */
 946    return JB_ERR_OK;
 947 }
 948
 949
 950 /*********************************************************************
 951  *
 952  * Function    :  simplematch
 953  *
 954  * Description :  String matching, with a (greedy) '*' wildcard that
 955  *                stands for zero or more arbitrary characters and
 956  *                character classes in [], which take both enumerations
 957  *                and ranges.
 958  *
 959  * Parameters  :
 960  *          1  :  pattern = pattern for matching
 961  *          2  :  text    = text to be matched
 962  *
 963  * Returns     :  0 if match, else nonzero
 964  *
 965  *********************************************************************/
 966 static int simplematch(const char *pattern, const char *text)
 967 {
 968    const unsigned char *pat = (const unsigned char *)pattern;
 969    const unsigned char *txt = (const unsigned char *)text;
 970    const unsigned char *fallback = pat;
 971    int wildcard = 0;
 972
 973    unsigned char lastchar = 'a';
 974    unsigned i;
 975    unsigned char charmap[32];
 976
 977    while (*txt)
 978    {
 979
 980       /* EOF pattern but !EOF text? */
 981       if (*pat == '\0')
 982       {
 983          if (wildcard)
 984          {
 985             pat = fallback;
 986          }
 987          else
 988          {
 989             return 1;
 990          }
 991       }
 992
 993       /* '*' in the pattern?  */
 994       if (*pat == '*')
 995       {
 996
 997          /* The pattern ends afterwards? Speed up the return. */
 998          if (*++pat == '\0')
 999          {
1000             return 0;
1001          }
1002
1003          /* Else, set wildcard mode and remember position after '*' */
1004          wildcard = 1;
1005          fallback = pat;
1006       }
1007
1008       /* Character range specification? */
1009       if (*pat == '[')
1010       {
1011          memset(charmap, '\0', sizeof(charmap));
1012
1013          while (*++pat != ']')
1014          {
1015             if (!*pat)
1016             {
1017                return 1;
1018             }
1019             else if (*pat == '-')
1020             {
1021                if ((*++pat == ']') || *pat == '\0')
1022                {
1023                   return(1);
1024                }
1025                for (i = lastchar; i <= *pat; i++)
1026                {
1027                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
1028                }
1029             }
1030             else
1031             {
1032                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
1033                lastchar = *pat;
1034             }
1035          }
1036       } /* -END- if Character range specification */
1037
1038
1039       /*
1040        * Char match, or char range match?
1041        */
1042       if ( (*pat == *txt)
1043       ||   (*pat == '?')
1044       ||   ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))) )
1045       {
1046          /*
1047           * Sucess: Go ahead
1048           */
1049          pat++;
1050       }
1051       else if (!wildcard)
1052       {
1053          /*
1054           * No match && no wildcard: No luck
1055           */
1056          return 1;
1057       }
1058       else if (pat != fallback)
1059       {
1060          /*
1061           * Increment text pointer if in char range matching
1062           */
1063          if (*pat == ']')
1064          {
1065             txt++;
1066          }
1067          /*
1068           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
1069           */
1070          pat = fallback;
1071          /*
1072           * Restart matching from current text pointer
1073           */
1074          continue;
1075       }
1076       txt++;
1077    }
1078
1079    /* Cut off extra '*'s */
1080    if(*pat == '*')  pat++;
1081
1082    /* If this is the pattern's end, fine! */
1083    return(*pat);
1084
1085 }
1086
1087
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Compares two equally long vectors of domain
 *                components pairwise with simplematch(). Every
 *                pair has to match, so the comparison is anchored
 *                on both sides.
 *                This is only used by domain_match.
 *
 * Parameters  :
 *          1  :  pv = array of patterns to compare
 *          2  :  fv = array of domain components to compare
 *          3  :  len = number of entries to compare in both arrays
 *
 * Returns     :  0 => all components match, else no match.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   int remaining = len;

   while (remaining > 0)
   {
      /* The first component that doesn't match settles it. */
      if (0 != simplematch(*pv, *fv))
      {
         return 1;
      }
      pv++;
      fv++;
      remaining--;
   }

   return 0;

}
1122
1123
1124 /*********************************************************************
1125  *
1126  * Function    :  domain_match
1127  *
1128  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1129  *                pattern->unachored, the comparison is un-, left-,
1130  *                right-anchored, or both.
1131  *                The individual domain names are compared with
1132  *                simplematch().
1133  *
1134  * Parameters  :
1135  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
1136  *          2  :  fqdn = domain name against which the patterns are compared.
1137  *
1138  * Returns     :  0 => domains are equivalent, else no match.
1139  *
1140  *********************************************************************/
1141 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
1142 {
1143    char **pv, **fv;  /* vectors  */
1144    int    plen, flen;
1145    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1146
1147    plen = pattern->dcount;
1148    flen = fqdn->dcount;
1149
1150    if (flen < plen)
1151    {
1152       /* fqdn is too short to match this pattern */
1153       return 1;
1154    }
1155
1156    pv   = pattern->dvec;
1157    fv   = fqdn->dvec;
1158
1159    if (unanchored == ANCHOR_LEFT)
1160    {
1161       /*
1162        * Right anchored.
1163        *
1164        * Convert this into a fully anchored pattern with
1165        * the fqdn and pattern the same length
1166        */
1167       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1168       return simple_domaincmp(pv, fv, plen);
1169    }
1170    else if (unanchored == 0)
1171    {
1172       /* Fully anchored, check length */
1173       if (flen != plen)
1174       {
1175          return 1;
1176       }
1177       return simple_domaincmp(pv, fv, plen);
1178    }
1179    else if (unanchored == ANCHOR_RIGHT)
1180    {
1181       /* Left anchored, ignore all extra in fqdn */
1182       return simple_domaincmp(pv, fv, plen);
1183    }
1184    else
1185    {
1186       /* Unanchored */
1187       int n;
1188       int maxn = flen - plen;
1189       for (n = 0; n <= maxn; n++)
1190       {
1191          if (!simple_domaincmp(pv, fv, plen))
1192          {
1193             return 0;
1194          }
1195          /*
1196           * Doesn't match from start of fqdn
1197           * Try skipping first part of fqdn
1198           */
1199          fv++;
1200       }
1201       return 1;
1202    }
1203
1204 }
1205 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1206
1207
1208 /*********************************************************************
1209  *
1210  * Function    :  create_url_spec
1211  *
1212  * Description :  Creates a "url_spec" structure from a string.
1213  *                When finished, free with free_url_spec().
1214  *
1215  * Parameters  :
1216  *          1  :  url = Target url_spec to be filled in.  Will be
1217  *                      zeroed before use.
1218  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1219  *                      contents of this buffer are destroyed by this
1220  *                      function.  If this function succeeds, the
1221  *                      buffer is copied to url->spec.  If this
1222  *                      function fails, the contents of the buffer
1223  *                      are lost forever.
1224  *
1225  * Returns     :  JB_ERR_OK - Success
1226  *                JB_ERR_MEMORY - Out of memory
1227  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1228  *                               written to system log)
1229  *
1230  *********************************************************************/
1231 jb_err create_url_spec(struct url_spec *url, char *buf)
1232 {
1233    assert(url);
1234    assert(buf);
1235
1236    memset(url, '\0', sizeof(*url));
1237
1238    /* Remember the original specification for the CGI pages. */
1239    url->spec = strdup(buf);
1240    if (NULL == url->spec)
1241    {
1242       return JB_ERR_MEMORY;
1243    }
1244
1245    /* Is it tag pattern? */
1246    if (0 == strncmpic("TAG:", url->spec, 4))
1247    {
1248       /* The pattern starts with the first character after "TAG:" */
1249       const char *tag_pattern = buf + 4;
1250       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
1251    }
1252
1253    /* If it isn't a tag pattern it must be a URL pattern. */
1254    return compile_url_pattern(url, buf);
1255 }
1256
1257
1258 /*********************************************************************
1259  *
1260  * Function    :  free_url_spec
1261  *
1262  * Description :  Called from the "unloaders".  Freez the url
1263  *                structure elements.
1264  *
1265  * Parameters  :
1266  *          1  :  url = pointer to a url_spec structure.
1267  *
1268  * Returns     :  N/A
1269  *
1270  *********************************************************************/
1271 void free_url_spec(struct url_spec *url)
1272 {
1273    if (url == NULL) return;
1274
1275    freez(url->spec);
1276 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1277    if (url->host_regex)
1278    {
1279       regfree(url->host_regex);
1280       freez(url->host_regex);
1281    }
1282 #else
1283    freez(url->dbuffer);
1284    freez(url->dvec);
1285    url->dcount = 0;
1286 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1287    freez(url->port_list);
1288    if (url->preg)
1289    {
1290       regfree(url->preg);
1291       freez(url->preg);
1292    }
1293    if (url->tag_regex)
1294    {
1295       regfree(url->tag_regex);
1296       freez(url->tag_regex);
1297    }
1298 }
1299
1300
1301 /*********************************************************************
1302  *
1303  * Function    :  url_match
1304  *
1305  * Description :  Compare a URL against a URL pattern.
1306  *
1307  * Parameters  :
1308  *          1  :  pattern = a URL pattern
1309  *          2  :  url = URL to match
1310  *
1311  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1312  *
1313  *********************************************************************/
1314 int url_match(const struct url_spec *pattern,
1315               const struct http_request *http)
1316 {
1317    /* XXX: these should probably be functions. */
1318 #define PORT_MATCHES ((NULL == pattern->port_list) || match_portlist(pattern->port_list, http->port))
1319 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1320 #define DOMAIN_MATCHES ((NULL == pattern->host_regex) || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)))
1321 #else
1322 #define DOMAIN_MATCHES ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)))
1323 #endif
1324 #define PATH_MATCHES ((NULL == pattern->preg) || (0 == regexec(pattern->preg, http->path, 0, NULL, 0)))
1325
1326    if (pattern->tag_regex != NULL)
1327    {
1328       /* It's a tag pattern and shouldn't be matched against URLs */
1329       return 0;
1330    }
1331
1332    return (PORT_MATCHES && DOMAIN_MATCHES && PATH_MATCHES);
1333
1334 }
1335
1336
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is parsed in place; it is neither copied
 *                nor modified. In a range, an omitted lower bound
 *                counts as 0 and an omitted (or zero) upper bound
 *                counts as 65535. Items that don't start with a
 *                number count as 0.
 *
 * Parameters  :
 *          1  :  portlist = String with list
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match (also if portlist is NULL)
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      /* The item ends at the next comma or at the end of the list. */
      const char *comma = strchr(item, (int) ',');
      size_t item_length = (NULL != comma) ? (size_t)(comma - item) : strlen(item);
      const char *dash = memchr(item, (int) '-', item_length);

      if (NULL == dash)
      {
         /*
          * No dash, check for equality. strtol() stops at the
          * comma, so it never reads into the next item.
          */
         if ((long)port == strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         const char *lower = item;
         long min;
         long max;

         /*
          * If only white space precedes the dash, the lower bound
          * is missing. The dash must not be handed to strtol() in
          * that case, as it would be taken for a minus sign.
          */
         while ((lower < dash) && isspace((int)(unsigned char)*lower))
         {
            lower++;
         }
         min = (lower == dash) ? 0 : strtol(lower, NULL, 10);

         max = strtol(dash + 1, NULL, 10);
         if (0 == max)
         {
            max = 65535;
         }

         if (((long)port >= min) && ((long)port <= max))
         {
            return 1;
         }
      }

      /*
       * Jump to next item
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1415
1416
1417 /*
1418   Local Variables:
1419   tab-width: 3
1420   end:
1421 */