urlmatch.c

/* RCS/CVS identification string for this source file. */
const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.24 2008/04/06 14:54:26 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2003, 2006-2008 the SourceForge
  10  *                Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  * Revisions   :
  35  *    $Log: urlmatch.c,v $
  36  *    Revision 1.24  2008/04/06 14:54:26  fabiankeil
  37  *    Use PCRE syntax in host patterns when configured
  38  *    with --enable-pcre-host-patterns.
  39  *
  40  *    Revision 1.23  2008/04/05 12:19:20  fabiankeil
  41  *    Factor compile_host_pattern() out of create_url_spec().
  42  *
  43  *    Revision 1.22  2008/03/30 15:02:32  fabiankeil
  44  *    SZitify unknown_method().
  45  *
  46  *    Revision 1.21  2007/12/24 16:34:23  fabiankeil
  47  *    Band-aid (and micro-optimization) that makes it less likely to run out of
  48  *    stack space with overly-complex path patterns. Probably masks the problem
  49  *    reported by Lee in #1856679. Hohoho.
  50  *
  51  *    Revision 1.20  2007/09/02 15:31:20  fabiankeil
  52  *    Move match_portlist() from filter.c to urlmatch.c.
  53  *    It's used for url matching, not for filtering.
  54  *
  55  *    Revision 1.19  2007/09/02 13:42:11  fabiankeil
  56  *    - Allow port lists in url patterns.
  57  *    - Ditch unused url_spec member pathlen.
  58  *
  59  *    Revision 1.18  2007/07/30 16:42:21  fabiankeil
  60  *    Move the method check into unknown_method()
  61  *    and loop through the known methods instead
  62  *    of using a screen-long OR chain.
  63  *
  64  *    Revision 1.17  2007/04/15 16:39:21  fabiankeil
  65  *    Introduce tags as alternative way to specify which
  66  *    actions apply to a request. At the moment tags can be
  67  *    created based on client and server headers.
  68  *
  69  *    Revision 1.16  2007/02/13 13:59:24  fabiankeil
  70  *    Remove redundant log message.
  71  *
  72  *    Revision 1.15  2007/01/28 16:11:23  fabiankeil
  73  *    Accept WebDAV methods for subversion
  74  *    in parse_http_request(). Closes FR 1581425.
  75  *
  76  *    Revision 1.14  2007/01/06 14:23:56  fabiankeil
  77  *    Fix gcc43 warnings. Mark *csp as immutable
  78  *    for parse_http_url() and url_match().
  79  *    Replace a sprintf call with snprintf.
  80  *
  81  *    Revision 1.13  2006/12/06 19:50:54  fabiankeil
  82  *    parse_http_url() now handles intercepted
  83  *    HTTP request lines as well. Moved parts
  84  *    of parse_http_url()'s code into
  85  *    init_domain_components() so that it can
  86  *    be reused in chat().
  87  *
  88  *    Revision 1.12  2006/07/18 14:48:47  david__schmidt
  89  *    Reorganizing the repository: swapping out what was HEAD (the old 3.1 branch)
  90  *    with what was really the latest development (the v_3_0_branch branch)
  91  *
  92  *    Revision 1.10.2.7  2003/05/17 15:57:24  oes
  93  *     - parse_http_url now checks memory allocation failure for
  94  *       duplication of "*" URL and rejects "*something" URLs
  95  *       Closes bug #736344
  96  *     - Added a comment to what might look like a bug in
  97  *       create_url_spec (see !bug #736931)
  98  *     - Comment cosmetics
  99  *
 100  *    Revision 1.10.2.6  2003/05/07 12:39:48  oes
 101  *    Fix typo: Default port for https URLs is 443, not 143.
 102  *    Thanks to Scott Tregear for spotting this one.
 103  *
 104  *    Revision 1.10.2.5  2003/02/28 13:09:29  oes
 105  *    Fixed a rare double free condition as per Bug #694713
 106  *
 107  *    Revision 1.10.2.4  2003/02/28 12:57:44  oes
 108  *    Moved freeing of http request structure to its owner
 109  *    as per Dan Price's observations in Bug #694713
 110  *
 111  *    Revision 1.10.2.3  2002/11/12 16:50:40  oes
 112  *    Fixed memory leak in parse_http_request() reported by Oliver Stoeneberg. Fixes bug #637073
 113  *
 114  *    Revision 1.10.2.2  2002/09/25 14:53:15  oes
 115  *    Added basic support for OPTIONS and TRACE HTTP methods:
 116  *    parse_http_url now recognizes the "*" URI as well as
 117  *    the OPTIONS and TRACE method keywords.
 118  *
 119  *    Revision 1.10.2.1  2002/06/06 19:06:44  jongfoster
 120  *    Adding support for proprietary Microsoft WebDAV extensions
 121  *
 122  *    Revision 1.10  2002/05/12 21:40:37  jongfoster
 123  *    - Removing some unused code
 124  *
 125  *    Revision 1.9  2002/04/04 00:36:36  gliptak
 126  *    always use pcre for matching
 127  *
 128  *    Revision 1.8  2002/04/03 23:32:47  jongfoster
 129  *    Fixing memory leak on error
 130  *
 131  *    Revision 1.7  2002/03/26 22:29:55  swa
 132  *    we have a new homepage!
 133  *
 134  *    Revision 1.6  2002/03/24 13:25:43  swa
 135  *    name change related issues
 136  *
 137  *    Revision 1.5  2002/03/13 00:27:05  jongfoster
 138  *    Killing warnings
 139  *
 140  *    Revision 1.4  2002/03/07 03:46:17  oes
 141  *    Fixed compiler warnings
 142  *
 143  *    Revision 1.3  2002/03/03 14:51:11  oes
 144  *    Fixed CLF logging: Added ocmd member for client's request to struct http_request
 145  *
 146  *    Revision 1.2  2002/01/21 00:14:09  jongfoster
 147  *    Correcting comment style
 148  *    Fixing an uninitialized memory bug in create_url_spec()
 149  *
 150  *    Revision 1.1  2002/01/17 20:53:46  jongfoster
 151  *    Moving all our URL and URL pattern parsing code to the same file - it
 152  *    was scattered around in filters.c, loaders.c and parsers.c.
 153  *
 154  *    Providing a single, simple url_match(pattern,url) function - rather than
 155  *    the 3-line match routine which was repeated all over the place.
 156  *
 157  *    Renaming free_url to free_url_spec, since it frees a struct url_spec.
 158  *
 159  *    Providing parse_http_url() so that URLs can be parsed without faking a
 160  *    HTTP request line for parse_http_request() or repeating the parsing
 161  *    code (both of which were techniques that were actually in use).
 162  *
 163  *    Standardizing that struct http_request is used to represent a URL, and
 164  *    struct url_spec is used to represent a URL pattern.  (Before, URLs were
  165  *    represented as separate variables and a partially-filled-in url_spec).
 166  *
 167  *
 168  *********************************************************************/
 169 \f
 170
 171 #include "config.h"
 172
 173 #ifndef _WIN32
 174 #include <stdio.h>
 175 #include <sys/types.h>
 176 #endif
 177
 178 #include <stdlib.h>
 179 #include <ctype.h>
 180 #include <assert.h>
 181 #include <string.h>
 182
 183 #if !defined(_WIN32) && !defined(__OS2__)
 184 #include <unistd.h>
 185 #endif
 186
 187 #include "project.h"
 188 #include "urlmatch.h"
 189 #include "ssplit.h"
 190 #include "miscutil.h"
 191 #include "errlog.h"
 192
/*
 * Version string of the header this file was compiled against
 * (URLMATCH_H_VERSION is presumably provided by urlmatch.h).
 */
const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
 194
 195
 196 /*********************************************************************
 197  *
 198  * Function    :  free_http_request
 199  *
 200  * Description :  Freez a http_request structure
 201  *
 202  * Parameters  :
 203  *          1  :  http = points to a http_request structure to free
 204  *
 205  * Returns     :  N/A
 206  *
 207  *********************************************************************/
 208 void free_http_request(struct http_request *http)
 209 {
 210    assert(http);
 211
 212    freez(http->cmd);
 213    freez(http->ocmd);
 214    freez(http->gpc);
 215    freez(http->host);
 216    freez(http->url);
 217    freez(http->hostport);
 218    freez(http->path);
 219    freez(http->ver);
 220    freez(http->host_ip_addr_str);
 221    freez(http->dbuffer);
 222    freez(http->dvec);
 223    http->dcount = 0;
 224 }
 225
 226
 227 /*********************************************************************
 228  *
 229  * Function    :  init_domain_components
 230  *
 231  * Description :  Splits the domain name so we can compare it
 232  *                against wildcards. It used to be part of
 233  *                parse_http_url, but was separated because the
 234  *                same code is required in chat in case of
 235  *                intercepted requests.
 236  *
 237  * Parameters  :
 238  *          1  :  http = pointer to the http structure to hold elements.
 239  *
 240  * Returns     :  JB_ERR_OK on success
 241  *                JB_ERR_MEMORY on out of memory
 242  *                JB_ERR_PARSE on malformed command/URL
 243  *                             or >100 domains deep.
 244  *
 245  *********************************************************************/
 246 jb_err init_domain_components(struct http_request *http)
 247 {
 248    char *vec[BUFFER_SIZE];
 249    size_t size;
 250    char *p;
 251
 252    http->dbuffer = strdup(http->host);
 253    if (NULL == http->dbuffer)
 254    {
 255       return JB_ERR_MEMORY;
 256    }
 257
 258    /* map to lower case */
 259    for (p = http->dbuffer; *p ; p++)
 260    {
 261       *p = (char)tolower((int)(unsigned char)*p);
 262    }
 263
 264    /* split the domain name into components */
 265    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 266
 267    if (http->dcount <= 0)
 268    {
 269       /*
 270        * Error: More than SZ(vec) components in domain
 271        *    or: no components in domain
 272        */
 273       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 274       return JB_ERR_PARSE;
 275    }
 276
 277    /* save a copy of the pointers in dvec */
 278    size = (size_t)http->dcount * sizeof(*http->dvec);
 279
 280    http->dvec = (char **)malloc(size);
 281    if (NULL == http->dvec)
 282    {
 283       return JB_ERR_MEMORY;
 284    }
 285
 286    memcpy(http->dvec, vec, size);
 287
 288    return JB_ERR_OK;
 289 }
 290
 291
 292 /*********************************************************************
 293  *
 294  * Function    :  parse_http_url
 295  *
 296  * Description :  Parse out the host and port from the URL.  Find the
 297  *                hostname & path, port (if ':'), and/or password (if '@')
 298  *
 299  * Parameters  :
 300  *          1  :  url = URL (or is it URI?) to break down
 301  *          2  :  http = pointer to the http structure to hold elements.
 302  *                       Will be zeroed before use.  Note that this
 303  *                       function sets the http->gpc and http->ver
 304  *                       members to NULL.
 305  *          3  :  csp = Current client state (buffers, headers, etc...)
 306  *
 307  * Returns     :  JB_ERR_OK on success
 308  *                JB_ERR_MEMORY on out of memory
 309  *                JB_ERR_PARSE on malformed command/URL
 310  *                             or >100 domains deep.
 311  *
 312  *********************************************************************/
 313 jb_err parse_http_url(const char * url,
 314                       struct http_request *http,
 315                       const struct client_state *csp)
 316 {
 317    int host_available = 1; /* A proxy can dream. */
 318
 319    /*
 320     * Zero out the results structure
 321     */
 322    memset(http, '\0', sizeof(*http));
 323
 324
 325    /*
 326     * Save our initial URL
 327     */
 328    http->url = strdup(url);
 329    if (http->url == NULL)
 330    {
 331       return JB_ERR_MEMORY;
 332    }
 333
 334
 335    /*
 336     * Check for * URI. If found, we're done.
 337     */
 338    if (*http->url == '*')
 339    {
 340       if  ( NULL == (http->path = strdup("*"))
 341          || NULL == (http->hostport = strdup("")) )
 342       {
 343          return JB_ERR_MEMORY;
 344       }
 345       if (http->url[1] != '\0')
 346       {
 347          return JB_ERR_PARSE;
 348       }
 349       return JB_ERR_OK;
 350    }
 351
 352
 353    /*
 354     * Split URL into protocol,hostport,path.
 355     */
 356    {
 357       char *buf;
 358       char *url_noproto;
 359       char *url_path;
 360
 361       buf = strdup(url);
 362       if (buf == NULL)
 363       {
 364          return JB_ERR_MEMORY;
 365       }
 366
 367       /* Find the start of the URL in our scratch space */
 368       url_noproto = buf;
 369       if (strncmpic(url_noproto, "http://",  7) == 0)
 370       {
 371          url_noproto += 7;
 372          http->ssl = 0;
 373       }
 374       else if (strncmpic(url_noproto, "https://", 8) == 0)
 375       {
 376          url_noproto += 8;
 377          http->ssl = 1;
 378       }
 379       else if (*url_noproto == '/')
 380       {
 381         /*
 382          * Short request line without protocol and host.
 383          * Most likely because the client's request
 384          * was intercepted and redirected into Privoxy.
 385          */
 386          http->ssl = 0;
 387          http->host = NULL;
 388          host_available = 0;
 389       }
 390       else
 391       {
 392          http->ssl = 0;
 393       }
 394
 395       url_path = strchr(url_noproto, '/');
 396       if (url_path != NULL)
 397       {
 398          /*
 399           * Got a path.
 400           *
 401           * NOTE: The following line ignores the path for HTTPS URLS.
 402           * This means that you get consistent behaviour if you type a
 403           * https URL in and it's parsed by the function.  (When the
 404           * URL is actually retrieved, SSL hides the path part).
 405           */
 406          http->path = strdup(http->ssl ? "/" : url_path);
 407          *url_path = '\0';
 408          http->hostport = strdup(url_noproto);
 409       }
 410       else
 411       {
 412          /*
 413           * Repair broken HTTP requests that don't contain a path,
 414           * or CONNECT requests
 415           */
 416          http->path = strdup("/");
 417          http->hostport = strdup(url_noproto);
 418       }
 419
 420       freez(buf);
 421
 422       if ( (http->path == NULL)
 423         || (http->hostport == NULL))
 424       {
 425          return JB_ERR_MEMORY;
 426       }
 427    }
 428
 429    if (!host_available)
 430    {
 431       /* Without host, there is nothing left to do here */
 432       return JB_ERR_OK;
 433    }
 434
 435    /*
 436     * Split hostport into user/password (ignored), host, port.
 437     */
 438    {
 439       char *buf;
 440       char *host;
 441       char *port;
 442
 443       buf = strdup(http->hostport);
 444       if (buf == NULL)
 445       {
 446          return JB_ERR_MEMORY;
 447       }
 448
 449       /* check if url contains username and/or password */
 450       host = strchr(buf, '@');
 451       if (host != NULL)
 452       {
 453          /* Contains username/password, skip it and the @ sign. */
 454          host++;
 455       }
 456       else
 457       {
 458          /* No username or password. */
 459          host = buf;
 460       }
 461
 462       /* check if url contains port */
 463       port = strchr(host, ':');
 464       if (port != NULL)
 465       {
 466          /* Contains port */
 467          /* Terminate hostname and point to start of port string */
 468          *port++ = '\0';
 469          http->port = atoi(port);
 470       }
 471       else
 472       {
 473          /* No port specified. */
 474          http->port = (http->ssl ? 443 : 80);
 475       }
 476
 477       http->host = strdup(host);
 478
 479       free(buf);
 480
 481       if (http->host == NULL)
 482       {
 483          return JB_ERR_MEMORY;
 484       }
 485    }
 486
 487    /*
 488     * Split domain name so we can compare it against wildcards
 489     */
 490    return init_domain_components(http);
 491
 492 }
 493
 494
 495 /*********************************************************************
 496  *
 497  * Function    :  unknown_method
 498  *
 499  * Description :  Checks whether a method is unknown.
 500  *
 501  * Parameters  :
 502  *          1  :  method = points to a http method
 503  *
 504  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 505  *
 506  *********************************************************************/
 507 static int unknown_method(const char *method)
 508 {
 509    static const char *known_http_methods[] = {
 510       /* Basic HTTP request type */
 511       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 512       /* webDAV extensions (RFC2518) */
 513       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 514       /*
 515        * Microsoft webDAV extension for Exchange 2000.  See:
 516        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 517        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 518        */
 519       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 520       /*
 521        * Another Microsoft webDAV extension for Exchange 2000.  See:
 522        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 523        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 524        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 525        */
 526       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 527       /*
 528        * Yet another WebDAV extension, this time for
 529        * Web Distributed Authoring and Versioning (RFC3253)
 530        */
 531       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 532       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 533    };
 534    int i;
 535
 536    for (i = 0; i < SZ(known_http_methods); i++)
 537    {
 538       if (0 == strcmpic(method, known_http_methods[i]))
 539       {
 540          return FALSE;
 541       }
 542    }
 543
 544    return TRUE;
 545
 546 }
 547
 548
 549 /*********************************************************************
 550  *
 551  * Function    :  parse_http_request
 552  *
 553  * Description :  Parse out the host and port from the URL.  Find the
 554  *                hostname & path, port (if ':'), and/or password (if '@')
 555  *
 556  * Parameters  :
 557  *          1  :  req = HTTP request line to break down
 558  *          2  :  http = pointer to the http structure to hold elements
 559  *          3  :  csp = Current client state (buffers, headers, etc...)
 560  *
 561  * Returns     :  JB_ERR_OK on success
 562  *                JB_ERR_MEMORY on out of memory
 563  *                JB_ERR_CGI_PARAMS on malformed command/URL
 564  *                                  or >100 domains deep.
 565  *
 566  *********************************************************************/
 567 jb_err parse_http_request(const char *req,
 568                           struct http_request *http,
 569                           const struct client_state *csp)
 570 {
 571    char *buf;
 572    char *v[10]; /* XXX: Why 10? We should only need three. */
 573    int n;
 574    jb_err err;
 575    int is_connect = 0;
 576
 577    memset(http, '\0', sizeof(*http));
 578
 579    buf = strdup(req);
 580    if (buf == NULL)
 581    {
 582       return JB_ERR_MEMORY;
 583    }
 584
 585    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 586    if (n != 3)
 587    {
 588       free(buf);
 589       return JB_ERR_PARSE;
 590    }
 591
 592    /*
 593     * Fail in case of unknown methods
 594     * which we might not handle correctly.
 595     *
 596     * XXX: There should be a config option
 597     * to forward requests with unknown methods
 598     * anyway. Most of them don't need special
 599     * steps.
 600     */
 601    if (unknown_method(v[0]))
 602    {
 603       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 604       free(buf);
 605       return JB_ERR_PARSE;
 606    }
 607
 608    if (strcmpic(v[0], "CONNECT") == 0)
 609    {
 610       is_connect = 1;
 611    }
 612
 613    err = parse_http_url(v[1], http, csp);
 614    if (err)
 615    {
 616       free(buf);
 617       return err;
 618    }
 619
 620    /*
 621     * Copy the details into the structure
 622     */
 623    http->ssl = is_connect;
 624    http->cmd = strdup(req);
 625    http->gpc = strdup(v[0]);
 626    http->ver = strdup(v[2]);
 627
 628    if ( (http->cmd == NULL)
 629      || (http->gpc == NULL)
 630      || (http->ver == NULL) )
 631    {
 632       free(buf);
 633       return JB_ERR_MEMORY;
 634    }
 635
 636    free(buf);
 637    return JB_ERR_OK;
 638
 639 }
 640
 641
 642 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 643 /*********************************************************************
 644  *
 645  * Function    :  compile_host_pattern
 646  *
 647  * Description :  Parses and compiles a PCRE host pattern..
 648  *
 649  * Parameters  :
 650  *          1  :  url = Target url_spec to be filled in.
 651  *          2  :  host_pattern = Host pattern to compile.
 652  *
 653  * Returns     :  JB_ERR_OK - Success
 654  *                JB_ERR_MEMORY - Out of memory
 655  *                JB_ERR_PARSE - Cannot parse regex
 656  *
 657  *********************************************************************/
 658 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 659 {
 660    int errcode;
 661    char rebuf[BUFFER_SIZE];
 662
 663    assert(host_pattern);
 664    assert(strlen(host_pattern) < sizeof(rebuf) - 2);
 665
 666    url->host_regex = zalloc(sizeof(*url->host_regex));
 667    if (NULL == url->host_regex)
 668    {
 669       free_url_spec(url);
 670       return JB_ERR_MEMORY;
 671    }
 672
 673    snprintf(rebuf, sizeof(rebuf), "%s$", host_pattern);
 674
 675    errcode = regcomp(url->host_regex, rebuf,
 676       (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 677
 678    if (errcode)
 679    {
 680       size_t errlen = regerror(errcode, url->host_regex, rebuf, sizeof(rebuf));
 681       if (errlen > (sizeof(rebuf) - (size_t)1))
 682       {
 683          errlen = sizeof(rebuf) - (size_t)1;
 684       }
 685       rebuf[errlen] = '\0';
 686       log_error(LOG_LEVEL_ERROR, "error compiling %s: %s", url->spec, rebuf);
 687       free_url_spec(url);
 688
 689       return JB_ERR_PARSE;
 690    }
 691
 692    return JB_ERR_OK;
 693
 694 }
 695
 696 #else
 697
 698 /*********************************************************************
 699  *
 700  * Function    :  compile_host_pattern
 701  *
 702  * Description :  Parses and "compiles" an old-school host pattern.
 703  *
 704  * Parameters  :
 705  *          1  :  url = Target url_spec to be filled in.
 706  *          2  :  host_pattern = Host pattern to parse.
 707  *
 708  * Returns     :  JB_ERR_OK - Success
 709  *                JB_ERR_MEMORY - Out of memory
 710  *                JB_ERR_PARSE - Cannot parse regex
 711  *
 712  *********************************************************************/
 713 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 714 {
 715    char *v[150];
 716    size_t size;
 717    char *p;
 718
 719    /*
 720     * Parse domain part
 721     */
 722    if (host_pattern[strlen(host_pattern) - 1] == '.')
 723    {
 724       url->unanchored |= ANCHOR_RIGHT;
 725    }
 726    if (host_pattern[0] == '.')
 727    {
 728       url->unanchored |= ANCHOR_LEFT;
 729    }
 730
 731    /*
 732     * Split domain into components
 733     */
 734    url->dbuffer = strdup(host_pattern);
 735    if (NULL == url->dbuffer)
 736    {
 737       freez(url->spec);
 738       freez(url->path);
 739       regfree(url->preg);
 740       freez(url->preg);
 741       return JB_ERR_MEMORY;
 742    }
 743
 744    /*
 745     * Map to lower case
 746     */
 747    for (p = url->dbuffer; *p ; p++)
 748    {
 749       *p = (char)tolower((int)(unsigned char)*p);
 750    }
 751
 752    /*
 753     * Split the domain name into components
 754     */
 755    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 756
 757    if (url->dcount < 0)
 758    {
 759       freez(url->spec);
 760       freez(url->path);
 761       regfree(url->preg);
 762       freez(url->preg);
 763       freez(url->dbuffer);
 764       url->dcount = 0;
 765       return JB_ERR_MEMORY;
 766    }
 767    else if (url->dcount != 0)
 768    {
 769       /*
 770        * Save a copy of the pointers in dvec
 771        */
 772       size = (size_t)url->dcount * sizeof(*url->dvec);
 773
 774       url->dvec = (char **)malloc(size);
 775       if (NULL == url->dvec)
 776       {
 777          freez(url->spec);
 778          freez(url->path);
 779          regfree(url->preg);
 780          freez(url->preg);
 781          freez(url->dbuffer);
 782          url->dcount = 0;
 783          return JB_ERR_MEMORY;
 784       }
 785
 786       memcpy(url->dvec, v, size);
 787    }
 788    /*
 789     * else dcount == 0 in which case we needn't do anything,
 790     * since dvec will never be accessed and the pattern will
 791     * match all domains.
 792     */
 793    return JB_ERR_OK;
 794 }
 795
 796
 797 /*********************************************************************
 798  *
 799  * Function    :  simple_domaincmp
 800  *
 801  * Description :  Domain-wise Compare fqdn's.  The comparison is
 802  *                both left- and right-anchored.  The individual
 803  *                domain names are compared with simplematch().
 804  *                This is only used by domain_match.
 805  *
 806  * Parameters  :
 807  *          1  :  pv = array of patterns to compare
 808  *          2  :  fv = array of domain components to compare
 809  *          3  :  len = length of the arrays (both arrays are the
 810  *                      same length - if they weren't, it couldn't
 811  *                      possibly be a match).
 812  *
 813  * Returns     :  0 => domains are equivalent, else no match.
 814  *
 815  *********************************************************************/
 816 static int simple_domaincmp(char **pv, char **fv, int len)
 817 {
 818    int n;
 819
 820    for (n = 0; n < len; n++)
 821    {
 822       if (simplematch(pv[n], fv[n]))
 823       {
 824          return 1;
 825       }
 826    }
 827
 828    return 0;
 829
 830 }
 831
 832
 833 /*********************************************************************
 834  *
 835  * Function    :  domain_match
 836  *
 837  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 838  *                pattern->unachored, the comparison is un-, left-,
 839  *                right-anchored, or both.
 840  *                The individual domain names are compared with
 841  *                simplematch().
 842  *
 843  * Parameters  :
 844  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
 845  *          2  :  fqdn = domain name against which the patterns are compared.
 846  *
 847  * Returns     :  0 => domains are equivalent, else no match.
 848  *
 849  *********************************************************************/
 850 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
 851 {
 852    char **pv, **fv;  /* vectors  */
 853    int    plen, flen;
 854    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
 855
 856    plen = pattern->dcount;
 857    flen = fqdn->dcount;
 858
 859    if (flen < plen)
 860    {
 861       /* fqdn is too short to match this pattern */
 862       return 1;
 863    }
 864
 865    pv   = pattern->dvec;
 866    fv   = fqdn->dvec;
 867
 868    if (unanchored == ANCHOR_LEFT)
 869    {
 870       /*
 871        * Right anchored.
 872        *
 873        * Convert this into a fully anchored pattern with
 874        * the fqdn and pattern the same length
 875        */
 876       fv += (flen - plen); /* flen - plen >= 0 due to check above */
 877       return simple_domaincmp(pv, fv, plen);
 878    }
 879    else if (unanchored == 0)
 880    {
 881       /* Fully anchored, check length */
 882       if (flen != plen)
 883       {
 884          return 1;
 885       }
 886       return simple_domaincmp(pv, fv, plen);
 887    }
 888    else if (unanchored == ANCHOR_RIGHT)
 889    {
 890       /* Left anchored, ignore all extra in fqdn */
 891       return simple_domaincmp(pv, fv, plen);
 892    }
 893    else
 894    {
 895       /* Unanchored */
 896       int n;
 897       int maxn = flen - plen;
 898       for (n = 0; n <= maxn; n++)
 899       {
 900          if (!simple_domaincmp(pv, fv, plen))
 901          {
 902             return 0;
 903          }
 904          /*
 905           * Doesn't match from start of fqdn
 906           * Try skipping first part of fqdn
 907           */
 908          fv++;
 909       }
 910       return 1;
 911    }
 912
 913 }
 914 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 915
 916
 917 /*********************************************************************
 918  *
 919  * Function    :  create_url_spec
 920  *
 921  * Description :  Creates a "url_spec" structure from a string.
 922  *                When finished, free with free_url_spec().
 923  *
 924  * Parameters  :
 925  *          1  :  url = Target url_spec to be filled in.  Will be
 926  *                      zeroed before use.
 927  *          2  :  buf = Source pattern, null terminated.  NOTE: The
 928  *                      contents of this buffer are destroyed by this
 929  *                      function.  If this function succeeds, the
 930  *                      buffer is copied to url->spec.  If this
 931  *                      function fails, the contents of the buffer
 932  *                      are lost forever.
 933  *
 934  * Returns     :  JB_ERR_OK - Success
 935  *                JB_ERR_MEMORY - Out of memory
 936  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
 937  *                               written to system log)
 938  *
 939  *********************************************************************/
 940 jb_err create_url_spec(struct url_spec * url, const char * buf)
 941 {
 942    char *p;
 943    int errcode;
 944    size_t errlen;
 945    char rebuf[BUFFER_SIZE];
 946
 947    assert(url);
 948    assert(buf);
 949
 950    /*
 951     * Zero memory
 952     */
 953    memset(url, '\0', sizeof(*url));
 954
 955    /*
 956     * Save a copy of the orignal specification
 957     */
 958    if ((url->spec = strdup(buf)) == NULL)
 959    {
 960       return JB_ERR_MEMORY;
 961    }
 962
 963    /* Is it tag pattern? */
 964    if (0 == strncmpic("TAG:", url->spec, 4))
 965    {
 966       if (NULL == (url->tag_regex = zalloc(sizeof(*url->tag_regex))))
 967       {
 968          freez(url->spec);
 969          return JB_ERR_MEMORY;
 970       }
 971
 972       /* buf + 4 to skip "TAG:" */
 973       errcode = regcomp(url->tag_regex, buf + 4, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 974       if (errcode)
 975       {
 976          errlen = regerror(errcode, url->preg, rebuf, sizeof(rebuf));
 977          if (errlen > (sizeof(rebuf) - 1))
 978          {
 979             errlen = sizeof(rebuf) - 1;
 980          }
 981          rebuf[errlen] = '\0';
 982
 983          log_error(LOG_LEVEL_ERROR, "error compiling %s: %s", url->spec, rebuf);
 984
 985          freez(url->spec);
 986          regfree(url->tag_regex);
 987          freez(url->tag_regex);
 988
 989          return JB_ERR_PARSE;
 990       }
 991       return JB_ERR_OK;
 992    }
 993
 994    /* Only reached for URL patterns */
 995    p = strchr(buf, '/');
 996    if (NULL != p)
 997    {
 998       url->path = strdup(p);
 999       if (NULL == url->path)
1000       {
1001          freez(url->spec);
1002          return JB_ERR_MEMORY;
1003       }
1004       *p = '\0';
1005    }
1006    else
1007    {
1008       url->path = NULL;
1009    }
1010    if (url->path)
1011    {
1012       if (NULL == (url->preg = zalloc(sizeof(*url->preg))))
1013       {
1014          freez(url->spec);
1015          freez(url->path);
1016          return JB_ERR_MEMORY;
1017       }
1018
1019       snprintf(rebuf, sizeof(rebuf), "^(%s)", url->path);
1020
1021       errcode = regcomp(url->preg, rebuf,
1022             (REG_EXTENDED|REG_NOSUB|REG_ICASE));
1023       if (errcode)
1024       {
1025          errlen = regerror(errcode, url->preg, rebuf, sizeof(rebuf));
1026
1027          if (errlen > (sizeof(rebuf) - (size_t)1))
1028          {
1029             errlen = sizeof(rebuf) - (size_t)1;
1030          }
1031          rebuf[errlen] = '\0';
1032
1033          log_error(LOG_LEVEL_ERROR, "error compiling %s: %s",
1034             url->spec, rebuf);
1035
1036          freez(url->spec);
1037          freez(url->path);
1038          regfree(url->preg);
1039          freez(url->preg);
1040
1041          return JB_ERR_PARSE;
1042       }
1043    }
1044
1045    p = strchr(buf, ':');
1046    if (NULL != p)
1047    {
1048       *p++ = '\0';
1049       url->port_list = strdup(p);
1050       if (NULL == url->port_list)
1051       {
1052          return JB_ERR_MEMORY;
1053       }
1054    }
1055    else
1056    {
1057       url->port_list = NULL;
1058    }
1059
1060    if (buf[0] != '\0')
1061    {
1062       return compile_host_pattern(url, buf);
1063    }
1064
1065    return JB_ERR_OK;
1066
1067 }
1068
1069
1070 /*********************************************************************
1071  *
1072  * Function    :  free_url_spec
1073  *
1074  * Description :  Called from the "unloaders".  Freez the url
1075  *                structure elements.
1076  *
1077  * Parameters  :
1078  *          1  :  url = pointer to a url_spec structure.
1079  *
1080  * Returns     :  N/A
1081  *
1082  *********************************************************************/
1083 void free_url_spec(struct url_spec *url)
1084 {
1085    if (url == NULL) return;
1086
1087    freez(url->spec);
1088 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1089    if (url->host_regex)
1090    {
1091       regfree(url->host_regex);
1092       freez(url->host_regex);
1093    }
1094 #else
1095    freez(url->dbuffer);
1096    freez(url->dvec);
1097 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1098    freez(url->path);
1099    freez(url->port_list);
1100    if (url->preg)
1101    {
1102       regfree(url->preg);
1103       freez(url->preg);
1104    }
1105    if (url->tag_regex)
1106    {
1107       regfree(url->tag_regex);
1108       freez(url->tag_regex);
1109    }
1110 }
1111
1112
1113 /*********************************************************************
1114  *
1115  * Function    :  url_match
1116  *
1117  * Description :  Compare a URL against a URL pattern.
1118  *
1119  * Parameters  :
1120  *          1  :  pattern = a URL pattern
1121  *          2  :  url = URL to match
1122  *
1123  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1124  *
1125  *********************************************************************/
1126 int url_match(const struct url_spec *pattern,
1127               const struct http_request *url)
1128 {
1129    /* XXX: these should probably be functions. */
1130 #define PORT_MATCHES ((NULL == pattern->port_list) || match_portlist(pattern->port_list, url->port))
1131 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1132 #define DOMAIN_MATCHES ((NULL == pattern->host_regex) || (0 == regexec(pattern->host_regex, url->host, 0, NULL, 0)))
1133 #else
1134 #define DOMAIN_MATCHES ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, url)))
1135 #endif
1136 #define PATH_MATCHES ((NULL == pattern->path) || (0 == regexec(pattern->preg, url->path, 0, NULL, 0)))
1137
1138    if (pattern->tag_regex != NULL)
1139    {
1140       /* It's a tag pattern and shouldn't be matched against URLs */
1141       return 0;
1142    }
1143
1144    return (PORT_MATCHES && DOMAIN_MATCHES && PATH_MATCHES);
1145
1146 }
1147
1148
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is parsed in place: nothing is allocated
 *                and the string is not modified.  Numbers are read
 *                with strtol() in base 10, so an item without
 *                leading digits counts as 0.  An item containing a
 *                '-' is a range split at its first '-'; a missing
 *                lower bound means 0, a missing (or zero) upper
 *                bound means 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list.  Must not be NULL.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   for (;;)
   {
      /* The current item ends at the next comma or at the end of the list. */
      const char *comma = strchr(item, (int) ',');
      size_t item_len = (NULL != comma) ? (size_t)(comma - item) : strlen(item);
      const char *dash = memchr(item, (int) '-', item_len);

      if (NULL == dash)
      {
         /*
          * No dash, check for equality.  strtol() stops at the
          * comma, so it never reads into the following item.
          */
         if (port == strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         char *parse_end;
         long lower = strtol(item, &parse_end, 10);
         long upper = strtol(dash + 1, NULL, 10);

         if (parse_end > dash)
         {
            /*
             * There were no digits in front of the dash, so strtol()
             * took the dash as a sign and consumed the upper bound.
             * The lower bound was omitted: treat it as 0.
             */
            lower = 0;
         }
         if (0 == upper)
         {
            upper = 65535;
         }

         if ((port >= lower) && (port <= upper))
         {
            return 1;
         }
      }

      /*
       * Jump to next item
       */
      if (NULL == comma)
      {
         break;
      }
      item = comma + 1;
   }

   return 0;

}
1227
1228
1229 /*
1230   Local Variables:
1231   tab-width: 3
1232   end:
1233 */