urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.59 2009/06/10 13:17:17 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2009
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
  59 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  60
  61 enum regex_anchoring
  62 {
  63    NO_ANCHORING,
  64    LEFT_ANCHORED,
  65    RIGHT_ANCHORED,
  66    RIGHT_ANCHORED_HOST
  67 };
  68 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern);
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_MEMORY on out of memory
 119  *                JB_ERR_PARSE on malformed command/URL
 120  *                             or >100 domains deep.
 121  *
 122  *********************************************************************/
 123 jb_err init_domain_components(struct http_request *http)
 124 {
 125    char *vec[BUFFER_SIZE];
 126    size_t size;
 127    char *p;
 128
 129    http->dbuffer = strdup(http->host);
 130    if (NULL == http->dbuffer)
 131    {
 132       return JB_ERR_MEMORY;
 133    }
 134
 135    /* map to lower case */
 136    for (p = http->dbuffer; *p ; p++)
 137    {
 138       *p = (char)tolower((int)(unsigned char)*p);
 139    }
 140
 141    /* split the domain name into components */
 142    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 143
 144    if (http->dcount <= 0)
 145    {
 146       /*
 147        * Error: More than SZ(vec) components in domain
 148        *    or: no components in domain
 149        */
 150       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 151       return JB_ERR_PARSE;
 152    }
 153
 154    /* save a copy of the pointers in dvec */
 155    size = (size_t)http->dcount * sizeof(*http->dvec);
 156
 157    http->dvec = (char **)malloc(size);
 158    if (NULL == http->dvec)
 159    {
 160       return JB_ERR_MEMORY;
 161    }
 162
 163    memcpy(http->dvec, vec, size);
 164
 165    return JB_ERR_OK;
 166 }
 167 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 168
 169
 170 /*********************************************************************
 171  *
 172  * Function    :  parse_http_url
 173  *
 174  * Description :  Parse out the host and port from the URL.  Find the
 175  *                hostname & path, port (if ':'), and/or password (if '@')
 176  *
 177  * Parameters  :
 178  *          1  :  url = URL (or is it URI?) to break down
 179  *          2  :  http = pointer to the http structure to hold elements.
 180  *                       Must be initialized with valid values (like NULLs).
 181  *          3  :  require_protocol = Whether or not URLs without
 182  *                                   protocol are acceptable.
 183  *
 184  * Returns     :  JB_ERR_OK on success
 185  *                JB_ERR_MEMORY on out of memory
 186  *                JB_ERR_PARSE on malformed command/URL
 187  *                             or >100 domains deep.
 188  *
 189  *********************************************************************/
 190 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 191 {
 192    int host_available = 1; /* A proxy can dream. */
 193
 194    /*
 195     * Save our initial URL
 196     */
 197    http->url = strdup(url);
 198    if (http->url == NULL)
 199    {
 200       return JB_ERR_MEMORY;
 201    }
 202
 203
 204    /*
 205     * Check for * URI. If found, we're done.
 206     */
 207    if (*http->url == '*')
 208    {
 209       if  ( NULL == (http->path = strdup("*"))
 210          || NULL == (http->hostport = strdup("")) )
 211       {
 212          return JB_ERR_MEMORY;
 213       }
 214       if (http->url[1] != '\0')
 215       {
 216          return JB_ERR_PARSE;
 217       }
 218       return JB_ERR_OK;
 219    }
 220
 221
 222    /*
 223     * Split URL into protocol,hostport,path.
 224     */
 225    {
 226       char *buf;
 227       char *url_noproto;
 228       char *url_path;
 229
 230       buf = strdup(url);
 231       if (buf == NULL)
 232       {
 233          return JB_ERR_MEMORY;
 234       }
 235
 236       /* Find the start of the URL in our scratch space */
 237       url_noproto = buf;
 238       if (strncmpic(url_noproto, "http://",  7) == 0)
 239       {
 240          url_noproto += 7;
 241       }
 242       else if (strncmpic(url_noproto, "https://", 8) == 0)
 243       {
 244          /*
 245           * Should only happen when called from cgi_show_url_info().
 246           */
 247          url_noproto += 8;
 248          http->ssl = 1;
 249       }
 250       else if (*url_noproto == '/')
 251       {
 252         /*
 253          * Short request line without protocol and host.
 254          * Most likely because the client's request
 255          * was intercepted and redirected into Privoxy.
 256          */
 257          http->host = NULL;
 258          host_available = 0;
 259       }
 260       else if (require_protocol)
 261       {
 262          freez(buf);
 263          return JB_ERR_PARSE;
 264       }
 265
 266       url_path = strchr(url_noproto, '/');
 267       if (url_path != NULL)
 268       {
 269          /*
 270           * Got a path.
 271           *
 272           * NOTE: The following line ignores the path for HTTPS URLS.
 273           * This means that you get consistent behaviour if you type a
 274           * https URL in and it's parsed by the function.  (When the
 275           * URL is actually retrieved, SSL hides the path part).
 276           */
 277          http->path = strdup(http->ssl ? "/" : url_path);
 278          *url_path = '\0';
 279          http->hostport = strdup(url_noproto);
 280       }
 281       else
 282       {
 283          /*
 284           * Repair broken HTTP requests that don't contain a path,
 285           * or CONNECT requests
 286           */
 287          http->path = strdup("/");
 288          http->hostport = strdup(url_noproto);
 289       }
 290
 291       freez(buf);
 292
 293       if ( (http->path == NULL)
 294         || (http->hostport == NULL))
 295       {
 296          return JB_ERR_MEMORY;
 297       }
 298    }
 299
 300    if (!host_available)
 301    {
 302       /* Without host, there is nothing left to do here */
 303       return JB_ERR_OK;
 304    }
 305
 306    /*
 307     * Split hostport into user/password (ignored), host, port.
 308     */
 309    {
 310       char *buf;
 311       char *host;
 312       char *port;
 313
 314       buf = strdup(http->hostport);
 315       if (buf == NULL)
 316       {
 317          return JB_ERR_MEMORY;
 318       }
 319
 320       /* check if url contains username and/or password */
 321       host = strchr(buf, '@');
 322       if (host != NULL)
 323       {
 324          /* Contains username/password, skip it and the @ sign. */
 325          host++;
 326       }
 327       else
 328       {
 329          /* No username or password. */
 330          host = buf;
 331       }
 332
 333       /* Move after hostname before port number */
 334       if (*host == '[')
 335       {
 336          /* Numeric IPv6 address delimited by brackets */
 337          host++;
 338          port = strchr(host, ']');
 339
 340          if (port == NULL)
 341          {
 342             /* Missing closing bracket */
 343             freez(buf);
 344             return JB_ERR_PARSE;
 345          }
 346
 347          *port++ = '\0';
 348
 349          if (*port == '\0')
 350          {
 351             port = NULL;
 352          }
 353          else if (*port != ':')
 354          {
 355             /* Garbage after closing bracket */
 356             freez(buf);
 357             return JB_ERR_PARSE;
 358          }
 359       }
 360       else
 361       {
 362          /* Plain non-escaped hostname */
 363          port = strchr(host, ':');
 364       }
 365
 366       /* check if url contains port */
 367       if (port != NULL)
 368       {
 369          /* Contains port */
 370          /* Terminate hostname and point to start of port string */
 371          *port++ = '\0';
 372          http->port = atoi(port);
 373       }
 374       else
 375       {
 376          /* No port specified. */
 377          http->port = (http->ssl ? 443 : 80);
 378       }
 379
 380       http->host = strdup(host);
 381
 382       freez(buf);
 383
 384       if (http->host == NULL)
 385       {
 386          return JB_ERR_MEMORY;
 387       }
 388    }
 389
 390 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 391    return JB_ERR_OK;
 392 #else
 393    /* Split domain name so we can compare it against wildcards */
 394    return init_domain_components(http);
 395 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 396
 397 }
 398
 399
 400 /*********************************************************************
 401  *
 402  * Function    :  unknown_method
 403  *
 404  * Description :  Checks whether a method is unknown.
 405  *
 406  * Parameters  :
 407  *          1  :  method = points to a http method
 408  *
 409  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 410  *
 411  *********************************************************************/
 412 static int unknown_method(const char *method)
 413 {
 414    static const char *known_http_methods[] = {
 415       /* Basic HTTP request type */
 416       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 417       /* webDAV extensions (RFC2518) */
 418       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 419       /*
 420        * Microsoft webDAV extension for Exchange 2000.  See:
 421        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 422        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 423        */
 424       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 425       /*
 426        * Another Microsoft webDAV extension for Exchange 2000.  See:
 427        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 428        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 429        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 430        */
 431       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 432       /*
 433        * Yet another WebDAV extension, this time for
 434        * Web Distributed Authoring and Versioning (RFC3253)
 435        */
 436       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 437       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 438    };
 439    int i;
 440
 441    for (i = 0; i < SZ(known_http_methods); i++)
 442    {
 443       if (0 == strcmpic(method, known_http_methods[i]))
 444       {
 445          return FALSE;
 446       }
 447    }
 448
 449    return TRUE;
 450
 451 }
 452
 453
 454 /*********************************************************************
 455  *
 456  * Function    :  parse_http_request
 457  *
 458  * Description :  Parse out the host and port from the URL.  Find the
 459  *                hostname & path, port (if ':'), and/or password (if '@')
 460  *
 461  * Parameters  :
 462  *          1  :  req = HTTP request line to break down
 463  *          2  :  http = pointer to the http structure to hold elements
 464  *
 465  * Returns     :  JB_ERR_OK on success
 466  *                JB_ERR_MEMORY on out of memory
 467  *                JB_ERR_CGI_PARAMS on malformed command/URL
 468  *                                  or >100 domains deep.
 469  *
 470  *********************************************************************/
 471 jb_err parse_http_request(const char *req, struct http_request *http)
 472 {
 473    char *buf;
 474    char *v[10]; /* XXX: Why 10? We should only need three. */
 475    int n;
 476    jb_err err;
 477
 478    memset(http, '\0', sizeof(*http));
 479
 480    buf = strdup(req);
 481    if (buf == NULL)
 482    {
 483       return JB_ERR_MEMORY;
 484    }
 485
 486    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 487    if (n != 3)
 488    {
 489       freez(buf);
 490       return JB_ERR_PARSE;
 491    }
 492
 493    /*
 494     * Fail in case of unknown methods
 495     * which we might not handle correctly.
 496     *
 497     * XXX: There should be a config option
 498     * to forward requests with unknown methods
 499     * anyway. Most of them don't need special
 500     * steps.
 501     */
 502    if (unknown_method(v[0]))
 503    {
 504       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 505       freez(buf);
 506       return JB_ERR_PARSE;
 507    }
 508
 509    if (strcmpic(v[2], "HTTP/1.1") && strcmpic(v[2], "HTTP/1.0"))
 510    {
 511       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 512          "versions are 1.0 and 1.1. This rules out: %s", v[2]);
 513       freez(buf);
 514       return JB_ERR_PARSE;
 515    }
 516
 517    http->ssl = !strcmpic(v[0], "CONNECT");
 518
 519    err = parse_http_url(v[1], http, !http->ssl);
 520    if (err)
 521    {
 522       freez(buf);
 523       return err;
 524    }
 525
 526    /*
 527     * Copy the details into the structure
 528     */
 529    http->cmd = strdup(req);
 530    http->gpc = strdup(v[0]);
 531    http->ver = strdup(v[2]);
 532
 533    freez(buf);
 534
 535    if ( (http->cmd == NULL)
 536      || (http->gpc == NULL)
 537      || (http->ver == NULL) )
 538    {
 539       return JB_ERR_MEMORY;
 540    }
 541
 542    return JB_ERR_OK;
 543
 544 }
 545
 546
 547 /*********************************************************************
 548  *
 549  * Function    :  compile_pattern
 550  *
 551  * Description :  Compiles a host, domain or TAG pattern.
 552  *
 553  * Parameters  :
 554  *          1  :  pattern = The pattern to compile.
 555  *          2  :  anchoring = How the regex should be modified
 556  *                            before compilation. Can be either
 557  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 558  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 559  *          3  :  url     = In case of failures, the spec member is
 560  *                          logged and the structure freed.
 561  *          4  :  regex   = Where the compiled regex should be stored.
 562  *
 563  * Returns     :  JB_ERR_OK - Success
 564  *                JB_ERR_MEMORY - Out of memory
 565  *                JB_ERR_PARSE - Cannot parse regex
 566  *
 567  *********************************************************************/
 568 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 569                               struct url_spec *url, regex_t **regex)
 570 {
 571    int errcode;
 572    char rebuf[BUFFER_SIZE];
 573    const char *fmt = NULL;
 574
 575    assert(pattern);
 576    assert(strlen(pattern) < sizeof(rebuf) - 2);
 577
 578    if (pattern[0] == '\0')
 579    {
 580       *regex = NULL;
 581       return JB_ERR_OK;
 582    }
 583
 584    switch (anchoring)
 585    {
 586       case NO_ANCHORING:
 587          fmt = "%s";
 588          break;
 589       case RIGHT_ANCHORED:
 590          fmt = "%s$";
 591          break;
 592       case RIGHT_ANCHORED_HOST:
 593          fmt = "%s\\.?$";
 594          break;
 595       case LEFT_ANCHORED:
 596          fmt = "^%s";
 597          break;
 598       default:
 599          log_error(LOG_LEVEL_FATAL,
 600             "Invalid anchoring in compile_pattern %d", anchoring);
 601    }
 602
 603    *regex = zalloc(sizeof(**regex));
 604    if (NULL == *regex)
 605    {
 606       free_url_spec(url);
 607       return JB_ERR_MEMORY;
 608    }
 609
 610    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 611
 612    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 613
 614    if (errcode)
 615    {
 616       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 617       if (errlen > (sizeof(rebuf) - (size_t)1))
 618       {
 619          errlen = sizeof(rebuf) - (size_t)1;
 620       }
 621       rebuf[errlen] = '\0';
 622       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 623          pattern, url->spec, rebuf);
 624       free_url_spec(url);
 625
 626       return JB_ERR_PARSE;
 627    }
 628
 629    return JB_ERR_OK;
 630
 631 }
 632
 633
 634 /*********************************************************************
 635  *
 636  * Function    :  compile_url_pattern
 637  *
 638  * Description :  Compiles the three parts of an URL pattern.
 639  *
 640  * Parameters  :
 641  *          1  :  url = Target url_spec to be filled in.
 642  *          2  :  buf = The url pattern to compile. Will be messed up.
 643  *
 644  * Returns     :  JB_ERR_OK - Success
 645  *                JB_ERR_MEMORY - Out of memory
 646  *                JB_ERR_PARSE - Cannot parse regex
 647  *
 648  *********************************************************************/
 649 static jb_err compile_url_pattern(struct url_spec *url, char *buf)
 650 {
 651    char *p;
 652
 653    p = strchr(buf, '/');
 654    if (NULL != p)
 655    {
 656       /*
 657        * Only compile the regex if it consists of more than
 658        * a single slash, otherwise it wouldn't affect the result.
 659        */
 660       if (p[1] != '\0')
 661       {
 662          /*
 663           * XXX: does it make sense to compile the slash at the beginning?
 664           */
 665          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->preg);
 666
 667          if (JB_ERR_OK != err)
 668          {
 669             return err;
 670          }
 671       }
 672       *p = '\0';
 673    }
 674
 675    /*
 676     * IPv6 numeric hostnames can contain colons, thus we need
 677     * to delimit the hostname before the real port separator.
 678     * As brackets are already used in the hostname pattern,
 679     * we use angle brackets ('<', '>') instead.
 680     */
 681    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 682    {
 683       *p++ = '\0';
 684       buf++;
 685
 686       if (*p == '\0')
 687       {
 688          /* IPv6 address without port number */
 689          p = NULL;
 690       }
 691       else if (*p != ':')
 692       {
 693          /* Garbage after address delimiter */
 694          return JB_ERR_PARSE;
 695       }
 696    }
 697    else
 698    {
 699       p = strchr(buf, ':');
 700    }
 701
 702    if (NULL != p)
 703    {
 704       *p++ = '\0';
 705       url->port_list = strdup(p);
 706       if (NULL == url->port_list)
 707       {
 708          return JB_ERR_MEMORY;
 709       }
 710    }
 711    else
 712    {
 713       url->port_list = NULL;
 714    }
 715
 716    if (buf[0] != '\0')
 717    {
 718       return compile_host_pattern(url, buf);
 719    }
 720
 721    return JB_ERR_OK;
 722
 723 }
 724
 725
 726 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 727 /*********************************************************************
 728  *
 729  * Function    :  compile_host_pattern
 730  *
 731  * Description :  Parses and compiles a host pattern..
 732  *
 733  * Parameters  :
 734  *          1  :  url = Target url_spec to be filled in.
 735  *          2  :  host_pattern = Host pattern to compile.
 736  *
 737  * Returns     :  JB_ERR_OK - Success
 738  *                JB_ERR_MEMORY - Out of memory
 739  *                JB_ERR_PARSE - Cannot parse regex
 740  *
 741  *********************************************************************/
 742 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 743 {
 744    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->host_regex);
 745 }
 746
 747 #else
 748
 749 /*********************************************************************
 750  *
 751  * Function    :  compile_host_pattern
 752  *
 753  * Description :  Parses and "compiles" an old-school host pattern.
 754  *
 755  * Parameters  :
 756  *          1  :  url = Target url_spec to be filled in.
 757  *          2  :  host_pattern = Host pattern to parse.
 758  *
 759  * Returns     :  JB_ERR_OK - Success
 760  *                JB_ERR_MEMORY - Out of memory
 761  *                JB_ERR_PARSE - Cannot parse regex
 762  *
 763  *********************************************************************/
 764 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 765 {
 766    char *v[150];
 767    size_t size;
 768    char *p;
 769
 770    /*
 771     * Parse domain part
 772     */
 773    if (host_pattern[strlen(host_pattern) - 1] == '.')
 774    {
 775       url->unanchored |= ANCHOR_RIGHT;
 776    }
 777    if (host_pattern[0] == '.')
 778    {
 779       url->unanchored |= ANCHOR_LEFT;
 780    }
 781
 782    /*
 783     * Split domain into components
 784     */
 785    url->dbuffer = strdup(host_pattern);
 786    if (NULL == url->dbuffer)
 787    {
 788       free_url_spec(url);
 789       return JB_ERR_MEMORY;
 790    }
 791
 792    /*
 793     * Map to lower case
 794     */
 795    for (p = url->dbuffer; *p ; p++)
 796    {
 797       *p = (char)tolower((int)(unsigned char)*p);
 798    }
 799
 800    /*
 801     * Split the domain name into components
 802     */
 803    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 804
 805    if (url->dcount < 0)
 806    {
 807       free_url_spec(url);
 808       return JB_ERR_MEMORY;
 809    }
 810    else if (url->dcount != 0)
 811    {
 812       /*
 813        * Save a copy of the pointers in dvec
 814        */
 815       size = (size_t)url->dcount * sizeof(*url->dvec);
 816
 817       url->dvec = (char **)malloc(size);
 818       if (NULL == url->dvec)
 819       {
 820          free_url_spec(url);
 821          return JB_ERR_MEMORY;
 822       }
 823
 824       memcpy(url->dvec, v, size);
 825    }
 826    /*
 827     * else dcount == 0 in which case we needn't do anything,
 828     * since dvec will never be accessed and the pattern will
 829     * match all domains.
 830     */
 831    return JB_ERR_OK;
 832 }
 833
 834
 835 /*********************************************************************
 836  *
 837  * Function    :  simplematch
 838  *
 839  * Description :  String matching, with a (greedy) '*' wildcard that
 840  *                stands for zero or more arbitrary characters and
 841  *                character classes in [], which take both enumerations
 842  *                and ranges.
 843  *
 844  * Parameters  :
 845  *          1  :  pattern = pattern for matching
 846  *          2  :  text    = text to be matched
 847  *
 848  * Returns     :  0 if match, else nonzero
 849  *
 850  *********************************************************************/
 851 static int simplematch(const char *pattern, const char *text)
 852 {
 853    const unsigned char *pat = (const unsigned char *)pattern;
 854    const unsigned char *txt = (const unsigned char *)text;
 855    const unsigned char *fallback = pat;
 856    int wildcard = 0;
 857
 858    unsigned char lastchar = 'a';
 859    unsigned i;
 860    unsigned char charmap[32];
 861
 862    while (*txt)
 863    {
 864
 865       /* EOF pattern but !EOF text? */
 866       if (*pat == '\0')
 867       {
 868          if (wildcard)
 869          {
 870             pat = fallback;
 871          }
 872          else
 873          {
 874             return 1;
 875          }
 876       }
 877
 878       /* '*' in the pattern?  */
 879       if (*pat == '*')
 880       {
 881
 882          /* The pattern ends afterwards? Speed up the return. */
 883          if (*++pat == '\0')
 884          {
 885             return 0;
 886          }
 887
 888          /* Else, set wildcard mode and remember position after '*' */
 889          wildcard = 1;
 890          fallback = pat;
 891       }
 892
 893       /* Character range specification? */
 894       if (*pat == '[')
 895       {
 896          memset(charmap, '\0', sizeof(charmap));
 897
 898          while (*++pat != ']')
 899          {
 900             if (!*pat)
 901             {
 902                return 1;
 903             }
 904             else if (*pat == '-')
 905             {
 906                if ((*++pat == ']') || *pat == '\0')
 907                {
 908                   return(1);
 909                }
 910                for (i = lastchar; i <= *pat; i++)
 911                {
 912                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 913                }
 914             }
 915             else
 916             {
 917                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 918                lastchar = *pat;
 919             }
 920          }
 921       } /* -END- if Character range specification */
 922
 923
 924       /*
 925        * Char match, or char range match?
 926        */
 927       if ( (*pat == *txt)
 928       ||   (*pat == '?')
 929       ||   ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))) )
 930       {
 931          /*
 932           * Success: Go ahead
 933           */
 934          pat++;
 935       }
 936       else if (!wildcard)
 937       {
 938          /*
 939           * No match && no wildcard: No luck
 940           */
 941          return 1;
 942       }
 943       else if (pat != fallback)
 944       {
 945          /*
 946           * Increment text pointer if in char range matching
 947           */
 948          if (*pat == ']')
 949          {
 950             txt++;
 951          }
 952          /*
 953           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 954           */
 955          pat = fallback;
 956          /*
 957           * Restart matching from current text pointer
 958           */
 959          continue;
 960       }
 961       txt++;
 962    }
 963
 964    /* Cut off extra '*'s */
 965    if(*pat == '*')  pat++;
 966
 967    /* If this is the pattern's end, fine! */
 968    return(*pat);
 969
 970 }
 971
 972
 973 /*********************************************************************
 974  *
 975  * Function    :  simple_domaincmp
 976  *
 977  * Description :  Domain-wise Compare fqdn's.  The comparison is
 978  *                both left- and right-anchored.  The individual
 979  *                domain names are compared with simplematch().
 980  *                This is only used by domain_match.
 981  *
 982  * Parameters  :
 983  *          1  :  pv = array of patterns to compare
 984  *          2  :  fv = array of domain components to compare
 985  *          3  :  len = length of the arrays (both arrays are the
 986  *                      same length - if they weren't, it couldn't
 987  *                      possibly be a match).
 988  *
 989  * Returns     :  0 => domains are equivalent, else no match.
 990  *
 991  *********************************************************************/
 992 static int simple_domaincmp(char **pv, char **fv, int len)
 993 {
 994    int n;
 995
 996    for (n = 0; n < len; n++)
 997    {
 998       if (simplematch(pv[n], fv[n]))
 999       {
1000          return 1;
1001       }
1002    }
1003
1004    return 0;
1005
1006 }
1007
1008
1009 /*********************************************************************
1010  *
1011  * Function    :  domain_match
1012  *
1013  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1014  *                pattern->unachored, the comparison is un-, left-,
1015  *                right-anchored, or both.
1016  *                The individual domain names are compared with
1017  *                simplematch().
1018  *
1019  * Parameters  :
1020  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
1021  *          2  :  fqdn = domain name against which the patterns are compared.
1022  *
1023  * Returns     :  0 => domains are equivalent, else no match.
1024  *
1025  *********************************************************************/
1026 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
1027 {
1028    char **pv, **fv;  /* vectors  */
1029    int    plen, flen;
1030    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1031
1032    plen = pattern->dcount;
1033    flen = fqdn->dcount;
1034
1035    if (flen < plen)
1036    {
1037       /* fqdn is too short to match this pattern */
1038       return 1;
1039    }
1040
1041    pv   = pattern->dvec;
1042    fv   = fqdn->dvec;
1043
1044    if (unanchored == ANCHOR_LEFT)
1045    {
1046       /*
1047        * Right anchored.
1048        *
1049        * Convert this into a fully anchored pattern with
1050        * the fqdn and pattern the same length
1051        */
1052       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1053       return simple_domaincmp(pv, fv, plen);
1054    }
1055    else if (unanchored == 0)
1056    {
1057       /* Fully anchored, check length */
1058       if (flen != plen)
1059       {
1060          return 1;
1061       }
1062       return simple_domaincmp(pv, fv, plen);
1063    }
1064    else if (unanchored == ANCHOR_RIGHT)
1065    {
1066       /* Left anchored, ignore all extra in fqdn */
1067       return simple_domaincmp(pv, fv, plen);
1068    }
1069    else
1070    {
1071       /* Unanchored */
1072       int n;
1073       int maxn = flen - plen;
1074       for (n = 0; n <= maxn; n++)
1075       {
1076          if (!simple_domaincmp(pv, fv, plen))
1077          {
1078             return 0;
1079          }
1080          /*
1081           * Doesn't match from start of fqdn
1082           * Try skipping first part of fqdn
1083           */
1084          fv++;
1085       }
1086       return 1;
1087    }
1088
1089 }
1090 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1091
1092
1093 /*********************************************************************
1094  *
1095  * Function    :  create_url_spec
1096  *
1097  * Description :  Creates a "url_spec" structure from a string.
1098  *                When finished, free with free_url_spec().
1099  *
1100  * Parameters  :
1101  *          1  :  url = Target url_spec to be filled in.  Will be
1102  *                      zeroed before use.
1103  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1104  *                      contents of this buffer are destroyed by this
1105  *                      function.  If this function succeeds, the
1106  *                      buffer is copied to url->spec.  If this
1107  *                      function fails, the contents of the buffer
1108  *                      are lost forever.
1109  *
1110  * Returns     :  JB_ERR_OK - Success
1111  *                JB_ERR_MEMORY - Out of memory
1112  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1113  *                               written to system log)
1114  *
1115  *********************************************************************/
1116 jb_err create_url_spec(struct url_spec *url, char *buf)
1117 {
1118    assert(url);
1119    assert(buf);
1120
1121    memset(url, '\0', sizeof(*url));
1122
1123    /* Remember the original specification for the CGI pages. */
1124    url->spec = strdup(buf);
1125    if (NULL == url->spec)
1126    {
1127       return JB_ERR_MEMORY;
1128    }
1129
1130    /* Is it a tag pattern? */
1131    if (0 == strncmpic("TAG:", url->spec, 4))
1132    {
1133       /* The pattern starts with the first character after "TAG:" */
1134       const char *tag_pattern = buf + 4;
1135       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
1136    }
1137
1138    /* If it isn't a tag pattern it must be an URL pattern. */
1139    return compile_url_pattern(url, buf);
1140 }
1141
1142
1143 /*********************************************************************
1144  *
1145  * Function    :  free_url_spec
1146  *
1147  * Description :  Called from the "unloaders".  Freez the url
1148  *                structure elements.
1149  *
1150  * Parameters  :
1151  *          1  :  url = pointer to a url_spec structure.
1152  *
1153  * Returns     :  N/A
1154  *
1155  *********************************************************************/
1156 void free_url_spec(struct url_spec *url)
1157 {
1158    if (url == NULL) return;
1159
1160    freez(url->spec);
1161 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1162    if (url->host_regex)
1163    {
1164       regfree(url->host_regex);
1165       freez(url->host_regex);
1166    }
1167 #else
1168    freez(url->dbuffer);
1169    freez(url->dvec);
1170    url->dcount = 0;
1171 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1172    freez(url->port_list);
1173    if (url->preg)
1174    {
1175       regfree(url->preg);
1176       freez(url->preg);
1177    }
1178    if (url->tag_regex)
1179    {
1180       regfree(url->tag_regex);
1181       freez(url->tag_regex);
1182    }
1183 }
1184
1185
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks a port against an optional port list.
 *                A missing (NULL) list accepts every port; otherwise
 *                the decision is left to match_portlist().
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL.
 *
 * Returns     :  TRUE for yes, FALSE otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (port_list == NULL)
   {
      /* No port restriction at all. */
      return 1;
   }

   return (0 != match_portlist(port_list, port));
}
1203
1204
1205 /*********************************************************************
1206  *
1207  * Function    :  host_matches
1208  *
1209  * Description :  Compares a host against a host pattern.
1210  *
1211  * Parameters  :
1212  *          1  :  url = The URL to match
1213  *          2  :  pattern = The URL pattern
1214  *
1215  * Returns     :  TRUE for yes, FALSE otherwise.
1216  *
1217  *********************************************************************/
1218 static int host_matches(const struct http_request *http,
1219                         const struct url_spec *pattern)
1220 {
1221 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1222    return ((NULL == pattern->host_regex)
1223       || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)));
1224 #else
1225    return ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)));
1226 #endif
1227 }
1228
1229
1230 /*********************************************************************
1231  *
1232  * Function    :  path_matches
1233  *
1234  * Description :  Compares a path against a path pattern.
1235  *
1236  * Parameters  :
1237  *          1  :  path = The path to match
1238  *          2  :  pattern = The URL pattern
1239  *
1240  * Returns     :  TRUE for yes, FALSE otherwise.
1241  *
1242  *********************************************************************/
1243 static int path_matches(const char *path, const struct url_spec *pattern)
1244 {
1245    return ((NULL == pattern->preg)
1246       || (0 == regexec(pattern->preg, path, 0, NULL, 0)));
1247 }
1248
1249
1250 /*********************************************************************
1251  *
1252  * Function    :  url_match
1253  *
1254  * Description :  Compare a URL against a URL pattern.
1255  *
1256  * Parameters  :
1257  *          1  :  pattern = a URL pattern
1258  *          2  :  url = URL to match
1259  *
1260  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1261  *
1262  *********************************************************************/
1263 int url_match(const struct url_spec *pattern,
1264               const struct http_request *http)
1265 {
1266    if (pattern->tag_regex != NULL)
1267    {
1268       /* It's a tag pattern and shouldn't be matched against URLs */
1269       return 0;
1270    }
1271
1272    return (port_matches(http->port, pattern->port_list)
1273       && host_matches(http, pattern) && path_matches(http->path, pattern));
1274
1275 }
1276
1277
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place.  No copy is made, so
 *                the function neither allocates memory nor can it
 *                fail for lack of it.
 *
 *                Numbers are interpreted the way atoi() does it:
 *                leading white space is skipped and text that isn't
 *                a number counts as 0.  Within a range the first
 *                dash separates the bounds; an omitted lower bound
 *                is 0 and an omitted (or zero) upper bound is 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list.  Must not be NULL.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (item != NULL)
   {
      /* The item ends at the next comma or at the end of the list. */
      const char *comma = strchr(item, (int) ',');
      size_t item_length = (comma != NULL) ? (size_t)(comma - item) : strlen(item);
      const char *dash = memchr(item, (int) '-', item_length);

      if (dash == NULL)
      {
         /*
          * No dash, check for equality.  strtol() stops at the
          * comma or the terminating null byte on its own.
          */
         if (port == strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         char *parsed_end;
         long range_min = strtol(item, &parsed_end, 10);
         long range_max = strtol(dash + 1, NULL, 10);

         if (parsed_end > dash)
         {
            /*
             * Nothing but white space precedes the dash, so strtol()
             * mistook the separator for a sign.  The lower bound was
             * omitted and therefore is 0.
             */
            range_min = 0;
         }
         if (range_max == 0)
         {
            range_max = 65535;
         }

         if ((port >= range_min) && (port <= range_max))
         {
            return 1;
         }
      }

      /*
       * Jump to next item
       */
      item = (comma != NULL) ? comma + 1 : NULL;
   }

   return 0;

}
1356
1357
1358 /*********************************************************************
1359  *
1360  * Function    :  parse_forwarder_address
1361  *
1362  * Description :  Parse out the host and port from a forwarder address.
1363  *
1364  * Parameters  :
1365  *          1  :  address = The forwarder address to parse.
1366  *          2  :  hostname = Used to return the hostname. NULL on error.
1367  *          3  :  port = Used to return the port. Untouched if no port
1368  *                       is specified.
1369  *
1370  * Returns     :  JB_ERR_OK on success
1371  *                JB_ERR_MEMORY on out of memory
1372  *                JB_ERR_PARSE on malformed address.
1373  *
1374  *********************************************************************/
1375 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1376 {
1377    char *p = address;
1378
1379    if ((*address == '[') && (NULL == strchr(address, ']')))
1380    {
1381       /* XXX: Should do some more validity checks here. */
1382       return JB_ERR_PARSE;
1383    }
1384
1385    *hostname = strdup(address);
1386    if (NULL == *hostname)
1387    {
1388       return JB_ERR_MEMORY;
1389    }
1390
1391    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1392    {
1393       *p++ = '\0';
1394       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1395       if (*p == ':')
1396       {
1397          *port = (int)strtol(++p, NULL, 0);
1398       }
1399    }
1400    else if (NULL != (p = strchr(*hostname, ':')))
1401    {
1402       *p++ = '\0';
1403       *port = (int)strtol(p, NULL, 0);
1404    }
1405
1406    return JB_ERR_OK;
1407
1408 }
1409
1410
1411 /*
1412   Local Variables:
1413   tab-width: 3
1414   end:
1415 */