urlmatch.c

/* Revision identifier of this source file (CVS "$Id$" keyword expansion). */
const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.31 2008/04/10 14:41:04 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2003, 2006-2008 the SourceForge
  10  *                Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  * Revisions   :
  35  *    $Log: urlmatch.c,v $
  36  *    Revision 1.31  2008/04/10 14:41:04  fabiankeil
  37  *    Ditch url_spec's path member now that it's no longer used.
  38  *
  39  *    Revision 1.30  2008/04/10 04:24:24  fabiankeil
  40  *    Stop duplicating the plain text representation of the path regex
  41  *    (and keeping the copy around). Once the regex is compiled it's no
  42  *    longer useful.
  43  *
  44  *    Revision 1.29  2008/04/10 04:17:56  fabiankeil
  45  *    In url_match(), check the right member for NULL when determining
  46  *    whether there's a path regex to execute. Looking for a plain-text
  47  *    representation works as well, but it looks "interesting" and that
  48  *    member will be removed soonish anyway.
  49  *
  50  *    Revision 1.28  2008/04/08 16:07:39  fabiankeil
  51  *    Make it harder to mistake url_match()'s
  52  *    second parameter for an url_spec.
  53  *
  54  *    Revision 1.27  2008/04/08 15:44:33  fabiankeil
  55  *    Save a bit of memory (and a few cpu cycles) by not bothering to
  56  *    compile slash-only path regexes that don't affect the result.
  57  *
  58  *    Revision 1.26  2008/04/07 16:57:18  fabiankeil
  59  *    - Use free_url_spec() more consistently.
  60  *    - Let it reset url->dcount just in case.
  61  *
  62  *    Revision 1.25  2008/04/06 15:18:38  fabiankeil
  63  *    Oh well, rename the --enable-pcre-host-patterns option to
  64  *    --enable-extended-host-patterns as it's not really PCRE syntax.
  65  *
  66  *    Revision 1.24  2008/04/06 14:54:26  fabiankeil
  67  *    Use PCRE syntax in host patterns when configured
  68  *    with --enable-pcre-host-patterns.
  69  *
  70  *    Revision 1.23  2008/04/05 12:19:20  fabiankeil
  71  *    Factor compile_host_pattern() out of create_url_spec().
  72  *
  73  *    Revision 1.22  2008/03/30 15:02:32  fabiankeil
  74  *    SZitify unknown_method().
  75  *
  76  *    Revision 1.21  2007/12/24 16:34:23  fabiankeil
  77  *    Band-aid (and micro-optimization) that makes it less likely to run out of
  78  *    stack space with overly-complex path patterns. Probably masks the problem
  79  *    reported by Lee in #1856679. Hohoho.
  80  *
  81  *    Revision 1.20  2007/09/02 15:31:20  fabiankeil
  82  *    Move match_portlist() from filter.c to urlmatch.c.
  83  *    It's used for url matching, not for filtering.
  84  *
  85  *    Revision 1.19  2007/09/02 13:42:11  fabiankeil
  86  *    - Allow port lists in url patterns.
  87  *    - Ditch unused url_spec member pathlen.
  88  *
  89  *    Revision 1.18  2007/07/30 16:42:21  fabiankeil
  90  *    Move the method check into unknown_method()
  91  *    and loop through the known methods instead
  92  *    of using a screen-long OR chain.
  93  *
  94  *    Revision 1.17  2007/04/15 16:39:21  fabiankeil
  95  *    Introduce tags as alternative way to specify which
  96  *    actions apply to a request. At the moment tags can be
  97  *    created based on client and server headers.
  98  *
  99  *    Revision 1.16  2007/02/13 13:59:24  fabiankeil
 100  *    Remove redundant log message.
 101  *
 102  *    Revision 1.15  2007/01/28 16:11:23  fabiankeil
 103  *    Accept WebDAV methods for subversion
 104  *    in parse_http_request(). Closes FR 1581425.
 105  *
 106  *    Revision 1.14  2007/01/06 14:23:56  fabiankeil
 107  *    Fix gcc43 warnings. Mark *csp as immutable
 108  *    for parse_http_url() and url_match().
 109  *    Replace a sprintf call with snprintf.
 110  *
 111  *    Revision 1.13  2006/12/06 19:50:54  fabiankeil
 112  *    parse_http_url() now handles intercepted
 113  *    HTTP request lines as well. Moved parts
 114  *    of parse_http_url()'s code into
 115  *    init_domain_components() so that it can
 116  *    be reused in chat().
 117  *
 118  *    Revision 1.12  2006/07/18 14:48:47  david__schmidt
 119  *    Reorganizing the repository: swapping out what was HEAD (the old 3.1 branch)
 120  *    with what was really the latest development (the v_3_0_branch branch)
 121  *
 122  *    Revision 1.10.2.7  2003/05/17 15:57:24  oes
 123  *     - parse_http_url now checks memory allocation failure for
 124  *       duplication of "*" URL and rejects "*something" URLs
 125  *       Closes bug #736344
 126  *     - Added a comment to what might look like a bug in
 127  *       create_url_spec (see !bug #736931)
 128  *     - Comment cosmetics
 129  *
 130  *    Revision 1.10.2.6  2003/05/07 12:39:48  oes
 131  *    Fix typo: Default port for https URLs is 443, not 143.
 132  *    Thanks to Scott Tregear for spotting this one.
 133  *
 134  *    Revision 1.10.2.5  2003/02/28 13:09:29  oes
 135  *    Fixed a rare double free condition as per Bug #694713
 136  *
 137  *    Revision 1.10.2.4  2003/02/28 12:57:44  oes
 138  *    Moved freeing of http request structure to its owner
 139  *    as per Dan Price's observations in Bug #694713
 140  *
 141  *    Revision 1.10.2.3  2002/11/12 16:50:40  oes
 142  *    Fixed memory leak in parse_http_request() reported by Oliver Stoeneberg. Fixes bug #637073
 143  *
 144  *    Revision 1.10.2.2  2002/09/25 14:53:15  oes
 145  *    Added basic support for OPTIONS and TRACE HTTP methods:
 146  *    parse_http_url now recognizes the "*" URI as well as
 147  *    the OPTIONS and TRACE method keywords.
 148  *
 149  *    Revision 1.10.2.1  2002/06/06 19:06:44  jongfoster
 150  *    Adding support for proprietary Microsoft WebDAV extensions
 151  *
 152  *    Revision 1.10  2002/05/12 21:40:37  jongfoster
 153  *    - Removing some unused code
 154  *
 155  *    Revision 1.9  2002/04/04 00:36:36  gliptak
 156  *    always use pcre for matching
 157  *
 158  *    Revision 1.8  2002/04/03 23:32:47  jongfoster
 159  *    Fixing memory leak on error
 160  *
 161  *    Revision 1.7  2002/03/26 22:29:55  swa
 162  *    we have a new homepage!
 163  *
 164  *    Revision 1.6  2002/03/24 13:25:43  swa
 165  *    name change related issues
 166  *
 167  *    Revision 1.5  2002/03/13 00:27:05  jongfoster
 168  *    Killing warnings
 169  *
 170  *    Revision 1.4  2002/03/07 03:46:17  oes
 171  *    Fixed compiler warnings
 172  *
 173  *    Revision 1.3  2002/03/03 14:51:11  oes
 174  *    Fixed CLF logging: Added ocmd member for client's request to struct http_request
 175  *
 176  *    Revision 1.2  2002/01/21 00:14:09  jongfoster
 177  *    Correcting comment style
 178  *    Fixing an uninitialized memory bug in create_url_spec()
 179  *
 180  *    Revision 1.1  2002/01/17 20:53:46  jongfoster
 181  *    Moving all our URL and URL pattern parsing code to the same file - it
 182  *    was scattered around in filters.c, loaders.c and parsers.c.
 183  *
 184  *    Providing a single, simple url_match(pattern,url) function - rather than
 185  *    the 3-line match routine which was repeated all over the place.
 186  *
 187  *    Renaming free_url to free_url_spec, since it frees a struct url_spec.
 188  *
 189  *    Providing parse_http_url() so that URLs can be parsed without faking a
 190  *    HTTP request line for parse_http_request() or repeating the parsing
 191  *    code (both of which were techniques that were actually in use).
 192  *
 193  *    Standardizing that struct http_request is used to represent a URL, and
 194  *    struct url_spec is used to represent a URL pattern.  (Before, URLs were
 195  *    represented as seperate variables and a partially-filled-in url_spec).
 196  *
 197  *
 198  *********************************************************************/
 199 \f
 200
 201 #include "config.h"
 202
 203 #ifndef _WIN32
 204 #include <stdio.h>
 205 #include <sys/types.h>
 206 #endif
 207
 208 #include <stdlib.h>
 209 #include <ctype.h>
 210 #include <assert.h>
 211 #include <string.h>
 212
 213 #if !defined(_WIN32) && !defined(__OS2__)
 214 #include <unistd.h>
 215 #endif
 216
 217 #include "project.h"
 218 #include "urlmatch.h"
 219 #include "ssplit.h"
 220 #include "miscutil.h"
 221 #include "errlog.h"
 222
/*
 * Initialized from the URLMATCH_H_VERSION macro
 * (presumably provided by urlmatch.h - confirm).
 */
const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;

/* Anchoring modes accepted by compile_pattern(). */
enum regex_anchoring {NO_ANCHORING, LEFT_ANCHORED, RIGHT_ANCHORED};
 226
 227 /*********************************************************************
 228  *
 229  * Function    :  free_http_request
 230  *
 231  * Description :  Freez a http_request structure
 232  *
 233  * Parameters  :
 234  *          1  :  http = points to a http_request structure to free
 235  *
 236  * Returns     :  N/A
 237  *
 238  *********************************************************************/
 239 void free_http_request(struct http_request *http)
 240 {
 241    assert(http);
 242
 243    freez(http->cmd);
 244    freez(http->ocmd);
 245    freez(http->gpc);
 246    freez(http->host);
 247    freez(http->url);
 248    freez(http->hostport);
 249    freez(http->path);
 250    freez(http->ver);
 251    freez(http->host_ip_addr_str);
 252    freez(http->dbuffer);
 253    freez(http->dvec);
 254    http->dcount = 0;
 255 }
 256
 257
 258 /*********************************************************************
 259  *
 260  * Function    :  init_domain_components
 261  *
 262  * Description :  Splits the domain name so we can compare it
 263  *                against wildcards. It used to be part of
 264  *                parse_http_url, but was separated because the
 265  *                same code is required in chat in case of
 266  *                intercepted requests.
 267  *
 268  * Parameters  :
 269  *          1  :  http = pointer to the http structure to hold elements.
 270  *
 271  * Returns     :  JB_ERR_OK on success
 272  *                JB_ERR_MEMORY on out of memory
 273  *                JB_ERR_PARSE on malformed command/URL
 274  *                             or >100 domains deep.
 275  *
 276  *********************************************************************/
 277 jb_err init_domain_components(struct http_request *http)
 278 {
 279    char *vec[BUFFER_SIZE];
 280    size_t size;
 281    char *p;
 282
 283    http->dbuffer = strdup(http->host);
 284    if (NULL == http->dbuffer)
 285    {
 286       return JB_ERR_MEMORY;
 287    }
 288
 289    /* map to lower case */
 290    for (p = http->dbuffer; *p ; p++)
 291    {
 292       *p = (char)tolower((int)(unsigned char)*p);
 293    }
 294
 295    /* split the domain name into components */
 296    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 297
 298    if (http->dcount <= 0)
 299    {
 300       /*
 301        * Error: More than SZ(vec) components in domain
 302        *    or: no components in domain
 303        */
 304       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 305       return JB_ERR_PARSE;
 306    }
 307
 308    /* save a copy of the pointers in dvec */
 309    size = (size_t)http->dcount * sizeof(*http->dvec);
 310
 311    http->dvec = (char **)malloc(size);
 312    if (NULL == http->dvec)
 313    {
 314       return JB_ERR_MEMORY;
 315    }
 316
 317    memcpy(http->dvec, vec, size);
 318
 319    return JB_ERR_OK;
 320 }
 321
 322
 323 /*********************************************************************
 324  *
 325  * Function    :  parse_http_url
 326  *
 327  * Description :  Parse out the host and port from the URL.  Find the
 328  *                hostname & path, port (if ':'), and/or password (if '@')
 329  *
 330  * Parameters  :
 331  *          1  :  url = URL (or is it URI?) to break down
 332  *          2  :  http = pointer to the http structure to hold elements.
 333  *                       Will be zeroed before use.  Note that this
 334  *                       function sets the http->gpc and http->ver
 335  *                       members to NULL.
 336  *          3  :  csp = Current client state (buffers, headers, etc...)
 337  *
 338  * Returns     :  JB_ERR_OK on success
 339  *                JB_ERR_MEMORY on out of memory
 340  *                JB_ERR_PARSE on malformed command/URL
 341  *                             or >100 domains deep.
 342  *
 343  *********************************************************************/
 344 jb_err parse_http_url(const char * url,
 345                       struct http_request *http,
 346                       const struct client_state *csp)
 347 {
 348    int host_available = 1; /* A proxy can dream. */
 349
 350    /*
 351     * Zero out the results structure
 352     */
 353    memset(http, '\0', sizeof(*http));
 354
 355
 356    /*
 357     * Save our initial URL
 358     */
 359    http->url = strdup(url);
 360    if (http->url == NULL)
 361    {
 362       return JB_ERR_MEMORY;
 363    }
 364
 365
 366    /*
 367     * Check for * URI. If found, we're done.
 368     */
 369    if (*http->url == '*')
 370    {
 371       if  ( NULL == (http->path = strdup("*"))
 372          || NULL == (http->hostport = strdup("")) )
 373       {
 374          return JB_ERR_MEMORY;
 375       }
 376       if (http->url[1] != '\0')
 377       {
 378          return JB_ERR_PARSE;
 379       }
 380       return JB_ERR_OK;
 381    }
 382
 383
 384    /*
 385     * Split URL into protocol,hostport,path.
 386     */
 387    {
 388       char *buf;
 389       char *url_noproto;
 390       char *url_path;
 391
 392       buf = strdup(url);
 393       if (buf == NULL)
 394       {
 395          return JB_ERR_MEMORY;
 396       }
 397
 398       /* Find the start of the URL in our scratch space */
 399       url_noproto = buf;
 400       if (strncmpic(url_noproto, "http://",  7) == 0)
 401       {
 402          url_noproto += 7;
 403          http->ssl = 0;
 404       }
 405       else if (strncmpic(url_noproto, "https://", 8) == 0)
 406       {
 407          url_noproto += 8;
 408          http->ssl = 1;
 409       }
 410       else if (*url_noproto == '/')
 411       {
 412         /*
 413          * Short request line without protocol and host.
 414          * Most likely because the client's request
 415          * was intercepted and redirected into Privoxy.
 416          */
 417          http->ssl = 0;
 418          http->host = NULL;
 419          host_available = 0;
 420       }
 421       else
 422       {
 423          http->ssl = 0;
 424       }
 425
 426       url_path = strchr(url_noproto, '/');
 427       if (url_path != NULL)
 428       {
 429          /*
 430           * Got a path.
 431           *
 432           * NOTE: The following line ignores the path for HTTPS URLS.
 433           * This means that you get consistent behaviour if you type a
 434           * https URL in and it's parsed by the function.  (When the
 435           * URL is actually retrieved, SSL hides the path part).
 436           */
 437          http->path = strdup(http->ssl ? "/" : url_path);
 438          *url_path = '\0';
 439          http->hostport = strdup(url_noproto);
 440       }
 441       else
 442       {
 443          /*
 444           * Repair broken HTTP requests that don't contain a path,
 445           * or CONNECT requests
 446           */
 447          http->path = strdup("/");
 448          http->hostport = strdup(url_noproto);
 449       }
 450
 451       freez(buf);
 452
 453       if ( (http->path == NULL)
 454         || (http->hostport == NULL))
 455       {
 456          return JB_ERR_MEMORY;
 457       }
 458    }
 459
 460    if (!host_available)
 461    {
 462       /* Without host, there is nothing left to do here */
 463       return JB_ERR_OK;
 464    }
 465
 466    /*
 467     * Split hostport into user/password (ignored), host, port.
 468     */
 469    {
 470       char *buf;
 471       char *host;
 472       char *port;
 473
 474       buf = strdup(http->hostport);
 475       if (buf == NULL)
 476       {
 477          return JB_ERR_MEMORY;
 478       }
 479
 480       /* check if url contains username and/or password */
 481       host = strchr(buf, '@');
 482       if (host != NULL)
 483       {
 484          /* Contains username/password, skip it and the @ sign. */
 485          host++;
 486       }
 487       else
 488       {
 489          /* No username or password. */
 490          host = buf;
 491       }
 492
 493       /* check if url contains port */
 494       port = strchr(host, ':');
 495       if (port != NULL)
 496       {
 497          /* Contains port */
 498          /* Terminate hostname and point to start of port string */
 499          *port++ = '\0';
 500          http->port = atoi(port);
 501       }
 502       else
 503       {
 504          /* No port specified. */
 505          http->port = (http->ssl ? 443 : 80);
 506       }
 507
 508       http->host = strdup(host);
 509
 510       free(buf);
 511
 512       if (http->host == NULL)
 513       {
 514          return JB_ERR_MEMORY;
 515       }
 516    }
 517
 518    /*
 519     * Split domain name so we can compare it against wildcards
 520     */
 521    return init_domain_components(http);
 522
 523 }
 524
 525
 526 /*********************************************************************
 527  *
 528  * Function    :  unknown_method
 529  *
 530  * Description :  Checks whether a method is unknown.
 531  *
 532  * Parameters  :
 533  *          1  :  method = points to a http method
 534  *
 535  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 536  *
 537  *********************************************************************/
 538 static int unknown_method(const char *method)
 539 {
 540    static const char *known_http_methods[] = {
 541       /* Basic HTTP request type */
 542       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 543       /* webDAV extensions (RFC2518) */
 544       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 545       /*
 546        * Microsoft webDAV extension for Exchange 2000.  See:
 547        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 548        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 549        */
 550       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 551       /*
 552        * Another Microsoft webDAV extension for Exchange 2000.  See:
 553        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 554        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 555        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 556        */
 557       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 558       /*
 559        * Yet another WebDAV extension, this time for
 560        * Web Distributed Authoring and Versioning (RFC3253)
 561        */
 562       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 563       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 564    };
 565    int i;
 566
 567    for (i = 0; i < SZ(known_http_methods); i++)
 568    {
 569       if (0 == strcmpic(method, known_http_methods[i]))
 570       {
 571          return FALSE;
 572       }
 573    }
 574
 575    return TRUE;
 576
 577 }
 578
 579
 580 /*********************************************************************
 581  *
 582  * Function    :  parse_http_request
 583  *
 584  * Description :  Parse out the host and port from the URL.  Find the
 585  *                hostname & path, port (if ':'), and/or password (if '@')
 586  *
 587  * Parameters  :
 588  *          1  :  req = HTTP request line to break down
 589  *          2  :  http = pointer to the http structure to hold elements
 590  *          3  :  csp = Current client state (buffers, headers, etc...)
 591  *
 592  * Returns     :  JB_ERR_OK on success
 593  *                JB_ERR_MEMORY on out of memory
 594  *                JB_ERR_CGI_PARAMS on malformed command/URL
 595  *                                  or >100 domains deep.
 596  *
 597  *********************************************************************/
 598 jb_err parse_http_request(const char *req,
 599                           struct http_request *http,
 600                           const struct client_state *csp)
 601 {
 602    char *buf;
 603    char *v[10]; /* XXX: Why 10? We should only need three. */
 604    int n;
 605    jb_err err;
 606    int is_connect = 0;
 607
 608    memset(http, '\0', sizeof(*http));
 609
 610    buf = strdup(req);
 611    if (buf == NULL)
 612    {
 613       return JB_ERR_MEMORY;
 614    }
 615
 616    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 617    if (n != 3)
 618    {
 619       free(buf);
 620       return JB_ERR_PARSE;
 621    }
 622
 623    /*
 624     * Fail in case of unknown methods
 625     * which we might not handle correctly.
 626     *
 627     * XXX: There should be a config option
 628     * to forward requests with unknown methods
 629     * anyway. Most of them don't need special
 630     * steps.
 631     */
 632    if (unknown_method(v[0]))
 633    {
 634       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 635       free(buf);
 636       return JB_ERR_PARSE;
 637    }
 638
 639    if (strcmpic(v[0], "CONNECT") == 0)
 640    {
 641       is_connect = 1;
 642    }
 643
 644    err = parse_http_url(v[1], http, csp);
 645    if (err)
 646    {
 647       free(buf);
 648       return err;
 649    }
 650
 651    /*
 652     * Copy the details into the structure
 653     */
 654    http->ssl = is_connect;
 655    http->cmd = strdup(req);
 656    http->gpc = strdup(v[0]);
 657    http->ver = strdup(v[2]);
 658
 659    if ( (http->cmd == NULL)
 660      || (http->gpc == NULL)
 661      || (http->ver == NULL) )
 662    {
 663       free(buf);
 664       return JB_ERR_MEMORY;
 665    }
 666
 667    free(buf);
 668    return JB_ERR_OK;
 669
 670 }
 671
 672
 673 /*********************************************************************
 674  *
 675  * Function    :  compile_pattern
 676  *
 677  * Description :  Compiles a host, domain or TAG pattern.
 678  *
 679  * Parameters  :
 680  *          1  :  pattern = The pattern to compile.
 681  *          2  :  anchoring = How the regex should be anchored.
 682  *                            Can be either one of NO_ANCHORING,
 683  *                            LEFT_ANCHORED or RIGHT_ANCHORED.
 684  *          3  :  url     = In case of failures, the spec member is
 685  *                          logged and the structure freed.
 686  *          4  :  regex   = Where the compiled regex should be stored.
 687  *
 688  * Returns     :  JB_ERR_OK - Success
 689  *                JB_ERR_MEMORY - Out of memory
 690  *                JB_ERR_PARSE - Cannot parse regex
 691  *
 692  *********************************************************************/
 693 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 694                               struct url_spec *url, regex_t **regex)
 695 {
 696    int errcode;
 697    char rebuf[BUFFER_SIZE];
 698    const char *fmt;
 699
 700    assert(pattern);
 701    assert(strlen(pattern) < sizeof(rebuf) - 2);
 702
 703    if (pattern[0] == '\0')
 704    {
 705       *regex = NULL;
 706       return JB_ERR_OK;
 707    }
 708
 709    switch (anchoring)
 710    {
 711       case NO_ANCHORING:
 712          fmt = "%s";
 713          break;
 714       case RIGHT_ANCHORED:
 715          fmt = "%s$";
 716          break;
 717       case LEFT_ANCHORED:
 718          fmt = "^%s";
 719          break;
 720       default:
 721          log_error(LOG_LEVEL_FATAL,
 722             "Invalid anchoring in compile_pattern %d", anchoring);
 723    }
 724
 725    *regex = zalloc(sizeof(**regex));
 726    if (NULL == *regex)
 727    {
 728       free_url_spec(url);
 729       return JB_ERR_MEMORY;
 730    }
 731
 732    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 733
 734    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 735
 736    if (errcode)
 737    {
 738       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 739       if (errlen > (sizeof(rebuf) - (size_t)1))
 740       {
 741          errlen = sizeof(rebuf) - (size_t)1;
 742       }
 743       rebuf[errlen] = '\0';
 744       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 745          pattern, url->spec, rebuf);
 746       free_url_spec(url);
 747
 748       return JB_ERR_PARSE;
 749    }
 750
 751    return JB_ERR_OK;
 752
 753 }
 754
 755 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 756 /*********************************************************************
 757  *
 758  * Function    :  compile_host_pattern
 759  *
 760  * Description :  Parses and compiles a host pattern..
 761  *
 762  * Parameters  :
 763  *          1  :  url = Target url_spec to be filled in.
 764  *          2  :  host_pattern = Host pattern to compile.
 765  *
 766  * Returns     :  JB_ERR_OK - Success
 767  *                JB_ERR_MEMORY - Out of memory
 768  *                JB_ERR_PARSE - Cannot parse regex
 769  *
 770  *********************************************************************/
 771 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 772 {
 773    return compile_pattern(host_pattern, RIGHT_ANCHORED, url, &url->host_regex);
 774 }
 775
 776 #else
 777
 778 /*********************************************************************
 779  *
 780  * Function    :  compile_host_pattern
 781  *
 782  * Description :  Parses and "compiles" an old-school host pattern.
 783  *
 784  * Parameters  :
 785  *          1  :  url = Target url_spec to be filled in.
 786  *          2  :  host_pattern = Host pattern to parse.
 787  *
 788  * Returns     :  JB_ERR_OK - Success
 789  *                JB_ERR_MEMORY - Out of memory
 790  *                JB_ERR_PARSE - Cannot parse regex
 791  *
 792  *********************************************************************/
 793 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 794 {
 795    char *v[150];
 796    size_t size;
 797    char *p;
 798
 799    /*
 800     * Parse domain part
 801     */
 802    if (host_pattern[strlen(host_pattern) - 1] == '.')
 803    {
 804       url->unanchored |= ANCHOR_RIGHT;
 805    }
 806    if (host_pattern[0] == '.')
 807    {
 808       url->unanchored |= ANCHOR_LEFT;
 809    }
 810
 811    /*
 812     * Split domain into components
 813     */
 814    url->dbuffer = strdup(host_pattern);
 815    if (NULL == url->dbuffer)
 816    {
 817       free_url_spec(url);
 818       return JB_ERR_MEMORY;
 819    }
 820
 821    /*
 822     * Map to lower case
 823     */
 824    for (p = url->dbuffer; *p ; p++)
 825    {
 826       *p = (char)tolower((int)(unsigned char)*p);
 827    }
 828
 829    /*
 830     * Split the domain name into components
 831     */
 832    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 833
 834    if (url->dcount < 0)
 835    {
 836       free_url_spec(url);
 837       return JB_ERR_MEMORY;
 838    }
 839    else if (url->dcount != 0)
 840    {
 841       /*
 842        * Save a copy of the pointers in dvec
 843        */
 844       size = (size_t)url->dcount * sizeof(*url->dvec);
 845
 846       url->dvec = (char **)malloc(size);
 847       if (NULL == url->dvec)
 848       {
 849          free_url_spec(url);
 850          return JB_ERR_MEMORY;
 851       }
 852
 853       memcpy(url->dvec, v, size);
 854    }
 855    /*
 856     * else dcount == 0 in which case we needn't do anything,
 857     * since dvec will never be accessed and the pattern will
 858     * match all domains.
 859     */
 860    return JB_ERR_OK;
 861 }
 862
 863
 864 /*********************************************************************
 865  *
 866  * Function    :  simple_domaincmp
 867  *
 868  * Description :  Domain-wise Compare fqdn's.  The comparison is
 869  *                both left- and right-anchored.  The individual
 870  *                domain names are compared with simplematch().
 871  *                This is only used by domain_match.
 872  *
 873  * Parameters  :
 874  *          1  :  pv = array of patterns to compare
 875  *          2  :  fv = array of domain components to compare
 876  *          3  :  len = length of the arrays (both arrays are the
 877  *                      same length - if they weren't, it couldn't
 878  *                      possibly be a match).
 879  *
 880  * Returns     :  0 => domains are equivalent, else no match.
 881  *
 882  *********************************************************************/
 883 static int simple_domaincmp(char **pv, char **fv, int len)
 884 {
 885    int n;
 886
 887    for (n = 0; n < len; n++)
 888    {
 889       if (simplematch(pv[n], fv[n]))
 890       {
 891          return 1;
 892       }
 893    }
 894
 895    return 0;
 896
 897 }
 898
 899
 900 /*********************************************************************
 901  *
 902  * Function    :  domain_match
 903  *
 904  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 905  *                pattern->unachored, the comparison is un-, left-,
 906  *                right-anchored, or both.
 907  *                The individual domain names are compared with
 908  *                simplematch().
 909  *
 910  * Parameters  :
 911  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
 912  *          2  :  fqdn = domain name against which the patterns are compared.
 913  *
 914  * Returns     :  0 => domains are equivalent, else no match.
 915  *
 916  *********************************************************************/
 917 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
 918 {
 919    char **pv, **fv;  /* vectors  */
 920    int    plen, flen;
 921    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
 922
 923    plen = pattern->dcount;
 924    flen = fqdn->dcount;
 925
 926    if (flen < plen)
 927    {
 928       /* fqdn is too short to match this pattern */
 929       return 1;
 930    }
 931
 932    pv   = pattern->dvec;
 933    fv   = fqdn->dvec;
 934
 935    if (unanchored == ANCHOR_LEFT)
 936    {
 937       /*
 938        * Right anchored.
 939        *
 940        * Convert this into a fully anchored pattern with
 941        * the fqdn and pattern the same length
 942        */
 943       fv += (flen - plen); /* flen - plen >= 0 due to check above */
 944       return simple_domaincmp(pv, fv, plen);
 945    }
 946    else if (unanchored == 0)
 947    {
 948       /* Fully anchored, check length */
 949       if (flen != plen)
 950       {
 951          return 1;
 952       }
 953       return simple_domaincmp(pv, fv, plen);
 954    }
 955    else if (unanchored == ANCHOR_RIGHT)
 956    {
 957       /* Left anchored, ignore all extra in fqdn */
 958       return simple_domaincmp(pv, fv, plen);
 959    }
 960    else
 961    {
 962       /* Unanchored */
 963       int n;
 964       int maxn = flen - plen;
 965       for (n = 0; n <= maxn; n++)
 966       {
 967          if (!simple_domaincmp(pv, fv, plen))
 968          {
 969             return 0;
 970          }
 971          /*
 972           * Doesn't match from start of fqdn
 973           * Try skipping first part of fqdn
 974           */
 975          fv++;
 976       }
 977       return 1;
 978    }
 979
 980 }
 981 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 982
 983
 984 /*********************************************************************
 985  *
 986  * Function    :  create_url_spec
 987  *
 988  * Description :  Creates a "url_spec" structure from a string.
 989  *                When finished, free with free_url_spec().
 990  *
 991  * Parameters  :
 992  *          1  :  url = Target url_spec to be filled in.  Will be
 993  *                      zeroed before use.
 994  *          2  :  buf = Source pattern, null terminated.  NOTE: The
 995  *                      contents of this buffer are destroyed by this
 996  *                      function.  If this function succeeds, the
 997  *                      buffer is copied to url->spec.  If this
 998  *                      function fails, the contents of the buffer
 999  *                      are lost forever.
1000  *
1001  * Returns     :  JB_ERR_OK - Success
1002  *                JB_ERR_MEMORY - Out of memory
1003  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1004  *                               written to system log)
1005  *
1006  *********************************************************************/
1007 jb_err create_url_spec(struct url_spec * url, const char * buf)
1008 {
1009    char *p;
1010
1011    assert(url);
1012    assert(buf);
1013
1014    /*
1015     * Zero memory
1016     */
1017    memset(url, '\0', sizeof(*url));
1018
1019    /*
1020     * Save a copy of the orignal specification
1021     */
1022    if ((url->spec = strdup(buf)) == NULL)
1023    {
1024       return JB_ERR_MEMORY;
1025    }
1026
1027    /* Is it tag pattern? */
1028    if (0 == strncmpic("TAG:", url->spec, 4))
1029    {
1030       /* The pattern starts with the first character after "TAG:" */
1031       const char *tag_pattern = buf + 4;
1032       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
1033    }
1034
1035    /* Only reached for URL patterns. XXX: should be factored out. */
1036    p = strchr(buf, '/');
1037    if (NULL != p)
1038    {
1039       /*
1040        * Only compile the regex if it consists of more than
1041        * a single slash, otherwise it wouldn't affect the result.
1042        */
1043       if (*(p+1) != '\0')
1044       {
1045          /*
1046           * XXX: does it make sense to compile the slash at the beginning?
1047           */
1048          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->preg);
1049
1050          if (JB_ERR_OK != err)
1051          {
1052             return err;
1053          }
1054       }
1055       *p = '\0';
1056    }
1057
1058    p = strchr(buf, ':');
1059    if (NULL != p)
1060    {
1061       *p++ = '\0';
1062       url->port_list = strdup(p);
1063       if (NULL == url->port_list)
1064       {
1065          return JB_ERR_MEMORY;
1066       }
1067    }
1068    else
1069    {
1070       url->port_list = NULL;
1071    }
1072
1073    if (buf[0] != '\0')
1074    {
1075       return compile_host_pattern(url, buf);
1076    }
1077
1078    return JB_ERR_OK;
1079
1080 }
1081
1082
1083 /*********************************************************************
1084  *
1085  * Function    :  free_url_spec
1086  *
1087  * Description :  Called from the "unloaders".  Freez the url
1088  *                structure elements.
1089  *
1090  * Parameters  :
1091  *          1  :  url = pointer to a url_spec structure.
1092  *
1093  * Returns     :  N/A
1094  *
1095  *********************************************************************/
1096 void free_url_spec(struct url_spec *url)
1097 {
1098    if (url == NULL) return;
1099
1100    freez(url->spec);
1101 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1102    if (url->host_regex)
1103    {
1104       regfree(url->host_regex);
1105       freez(url->host_regex);
1106    }
1107 #else
1108    freez(url->dbuffer);
1109    freez(url->dvec);
1110    url->dcount = 0;
1111 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1112    freez(url->port_list);
1113    if (url->preg)
1114    {
1115       regfree(url->preg);
1116       freez(url->preg);
1117    }
1118    if (url->tag_regex)
1119    {
1120       regfree(url->tag_regex);
1121       freez(url->tag_regex);
1122    }
1123 }
1124
1125
1126 /*********************************************************************
1127  *
1128  * Function    :  url_match
1129  *
1130  * Description :  Compare a URL against a URL pattern.
1131  *
1132  * Parameters  :
1133  *          1  :  pattern = a URL pattern
1134  *          2  :  url = URL to match
1135  *
1136  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1137  *
1138  *********************************************************************/
1139 int url_match(const struct url_spec *pattern,
1140               const struct http_request *http)
1141 {
1142    /* XXX: these should probably be functions. */
1143 #define PORT_MATCHES ((NULL == pattern->port_list) || match_portlist(pattern->port_list, http->port))
1144 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1145 #define DOMAIN_MATCHES ((NULL == pattern->host_regex) || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)))
1146 #else
1147 #define DOMAIN_MATCHES ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)))
1148 #endif
1149 #define PATH_MATCHES ((NULL == pattern->preg) || (0 == regexec(pattern->preg, http->path, 0, NULL, 0)))
1150
1151    if (pattern->tag_regex != NULL)
1152    {
1153       /* It's a tag pattern and shouldn't be matched against URLs */
1154       return 0;
1155    }
1156
1157    return (PORT_MATCHES && DOMAIN_MATCHES && PATH_MATCHES);
1158
1159 }
1160
1161
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                Within an item the first '-' separates the lower
 *                from the upper bound of a range.  A missing lower
 *                bound is read as 0; a missing (or zero) upper bound
 *                is read as 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list.  It is not modified;
 *                           the parsing is done on a private copy.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match (also if the private copy of the list
 *                     could not be allocated)
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   char *min, *max, *next, *portlist_copy;
   int matched = 0;

   portlist_copy = strdup(portlist);
   if (NULL == portlist_copy)
   {
      /* Out of memory: nothing can be parsed, report "no match". */
      return 0;
   }

   /*
    * Zero-terminate first item and remember offset for next
    */
   min = portlist_copy;
   next = strchr(portlist_copy, (int) ',');
   if (NULL != next)
   {
      *next++ = '\0';
   }

   /*
    * Loop through all items, checking for match
    */
   while (NULL != min)
   {
      max = strchr(min, (int) '-');
      if (NULL == max)
      {
         /*
          * No dash, check for equality
          */
         if (port == strtol(min, NULL, 10))
         {
            matched = 1;
            break;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         long low, high;

         *max++ = '\0';
         low  = strtol(min, NULL, 10);
         high = strtol(max, NULL, 10);
         if (0 == high)
         {
            high = 65535;
         }

         if ((port >= low) && (port <= high))
         {
            matched = 1;
            break;
         }
      }

      /*
       * Jump to next item
       */
      min = next;

      /*
       * Zero-terminate next item and remember offset for n+1
       */
      if ((NULL != next) && (NULL != (next = strchr(next, (int) ','))))
      {
         *next++ = '\0';
      }
   }

   free(portlist_copy);
   return matched;

}
1240
1241
1242 /*
1243   Local Variables:
1244   tab-width: 3
1245   end:
1246 */