urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.52 2009/05/19 17:39:59 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2009
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
  59 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  60
  61 enum regex_anchoring {NO_ANCHORING, LEFT_ANCHORED, RIGHT_ANCHORED};
  62 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern);
  63
  64 /*********************************************************************
  65  *
  66  * Function    :  free_http_request
  67  *
  68  * Description :  Freez a http_request structure
  69  *
  70  * Parameters  :
  71  *          1  :  http = points to a http_request structure to free
  72  *
  73  * Returns     :  N/A
  74  *
  75  *********************************************************************/
  76 void free_http_request(struct http_request *http)
  77 {
  78    assert(http);
  79
  80    freez(http->cmd);
  81    freez(http->ocmd);
  82    freez(http->gpc);
  83    freez(http->host);
  84    freez(http->url);
  85    freez(http->hostport);
  86    freez(http->path);
  87    freez(http->ver);
  88    freez(http->host_ip_addr_str);
  89    freez(http->dbuffer);
  90    freez(http->dvec);
  91    http->dcount = 0;
  92 }
  93
  94
  95 /*********************************************************************
  96  *
  97  * Function    :  init_domain_components
  98  *
  99  * Description :  Splits the domain name so we can compare it
 100  *                against wildcards. It used to be part of
 101  *                parse_http_url, but was separated because the
 102  *                same code is required in chat in case of
 103  *                intercepted requests.
 104  *
 105  * Parameters  :
 106  *          1  :  http = pointer to the http structure to hold elements.
 107  *
 108  * Returns     :  JB_ERR_OK on success
 109  *                JB_ERR_MEMORY on out of memory
 110  *                JB_ERR_PARSE on malformed command/URL
 111  *                             or >100 domains deep.
 112  *
 113  *********************************************************************/
 114 jb_err init_domain_components(struct http_request *http)
 115 {
 116    char *vec[BUFFER_SIZE];
 117    size_t size;
 118    char *p;
 119
 120    http->dbuffer = strdup(http->host);
 121    if (NULL == http->dbuffer)
 122    {
 123       return JB_ERR_MEMORY;
 124    }
 125
 126    /* map to lower case */
 127    for (p = http->dbuffer; *p ; p++)
 128    {
 129       *p = (char)tolower((int)(unsigned char)*p);
 130    }
 131
 132    /* split the domain name into components */
 133    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 134
 135    if (http->dcount <= 0)
 136    {
 137       /*
 138        * Error: More than SZ(vec) components in domain
 139        *    or: no components in domain
 140        */
 141       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 142       return JB_ERR_PARSE;
 143    }
 144
 145    /* save a copy of the pointers in dvec */
 146    size = (size_t)http->dcount * sizeof(*http->dvec);
 147
 148    http->dvec = (char **)malloc(size);
 149    if (NULL == http->dvec)
 150    {
 151       return JB_ERR_MEMORY;
 152    }
 153
 154    memcpy(http->dvec, vec, size);
 155
 156    return JB_ERR_OK;
 157 }
 158
 159
 160 /*********************************************************************
 161  *
 162  * Function    :  parse_http_url
 163  *
 164  * Description :  Parse out the host and port from the URL.  Find the
 165  *                hostname & path, port (if ':'), and/or password (if '@')
 166  *
 167  * Parameters  :
 168  *          1  :  url = URL (or is it URI?) to break down
 169  *          2  :  http = pointer to the http structure to hold elements.
 170  *                       Must be initialized with valid values (like NULLs).
 171  *          3  :  require_protocol = Whether or not URLs without
 172  *                                   protocol are acceptable.
 173  *
 174  * Returns     :  JB_ERR_OK on success
 175  *                JB_ERR_MEMORY on out of memory
 176  *                JB_ERR_PARSE on malformed command/URL
 177  *                             or >100 domains deep.
 178  *
 179  *********************************************************************/
 180 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 181 {
 182    int host_available = 1; /* A proxy can dream. */
 183
 184    /*
 185     * Save our initial URL
 186     */
 187    http->url = strdup(url);
 188    if (http->url == NULL)
 189    {
 190       return JB_ERR_MEMORY;
 191    }
 192
 193
 194    /*
 195     * Check for * URI. If found, we're done.
 196     */
 197    if (*http->url == '*')
 198    {
 199       if  ( NULL == (http->path = strdup("*"))
 200          || NULL == (http->hostport = strdup("")) )
 201       {
 202          return JB_ERR_MEMORY;
 203       }
 204       if (http->url[1] != '\0')
 205       {
 206          return JB_ERR_PARSE;
 207       }
 208       return JB_ERR_OK;
 209    }
 210
 211
 212    /*
 213     * Split URL into protocol,hostport,path.
 214     */
 215    {
 216       char *buf;
 217       char *url_noproto;
 218       char *url_path;
 219
 220       buf = strdup(url);
 221       if (buf == NULL)
 222       {
 223          return JB_ERR_MEMORY;
 224       }
 225
 226       /* Find the start of the URL in our scratch space */
 227       url_noproto = buf;
 228       if (strncmpic(url_noproto, "http://",  7) == 0)
 229       {
 230          url_noproto += 7;
 231       }
 232       else if (strncmpic(url_noproto, "https://", 8) == 0)
 233       {
 234          /*
 235           * Should only happen when called from cgi_show_url_info().
 236           */
 237          url_noproto += 8;
 238          http->ssl = 1;
 239       }
 240       else if (*url_noproto == '/')
 241       {
 242         /*
 243          * Short request line without protocol and host.
 244          * Most likely because the client's request
 245          * was intercepted and redirected into Privoxy.
 246          */
 247          http->host = NULL;
 248          host_available = 0;
 249       }
 250       else if (require_protocol)
 251       {
 252          freez(buf);
 253          return JB_ERR_PARSE;
 254       }
 255
 256       url_path = strchr(url_noproto, '/');
 257       if (url_path != NULL)
 258       {
 259          /*
 260           * Got a path.
 261           *
 262           * NOTE: The following line ignores the path for HTTPS URLS.
 263           * This means that you get consistent behaviour if you type a
 264           * https URL in and it's parsed by the function.  (When the
 265           * URL is actually retrieved, SSL hides the path part).
 266           */
 267          http->path = strdup(http->ssl ? "/" : url_path);
 268          *url_path = '\0';
 269          http->hostport = strdup(url_noproto);
 270       }
 271       else
 272       {
 273          /*
 274           * Repair broken HTTP requests that don't contain a path,
 275           * or CONNECT requests
 276           */
 277          http->path = strdup("/");
 278          http->hostport = strdup(url_noproto);
 279       }
 280
 281       freez(buf);
 282
 283       if ( (http->path == NULL)
 284         || (http->hostport == NULL))
 285       {
 286          return JB_ERR_MEMORY;
 287       }
 288    }
 289
 290    if (!host_available)
 291    {
 292       /* Without host, there is nothing left to do here */
 293       return JB_ERR_OK;
 294    }
 295
 296    /*
 297     * Split hostport into user/password (ignored), host, port.
 298     */
 299    {
 300       char *buf;
 301       char *host;
 302       char *port;
 303
 304       buf = strdup(http->hostport);
 305       if (buf == NULL)
 306       {
 307          return JB_ERR_MEMORY;
 308       }
 309
 310       /* check if url contains username and/or password */
 311       host = strchr(buf, '@');
 312       if (host != NULL)
 313       {
 314          /* Contains username/password, skip it and the @ sign. */
 315          host++;
 316       }
 317       else
 318       {
 319          /* No username or password. */
 320          host = buf;
 321       }
 322
 323       /* Move after hostname before port number */
 324       if (*host == '[')
 325       {
 326          /* Numeric IPv6 address delimited by brackets */
 327          host++;
 328          port = strchr(host, ']');
 329
 330          if (port == NULL)
 331          {
 332             /* Missing closing bracket */
 333             freez(buf);
 334             return JB_ERR_PARSE;
 335          }
 336
 337          *port++ = '\0';
 338
 339          if (*port == '\0')
 340          {
 341             port = NULL;
 342          }
 343          else if (*port != ':')
 344          {
 345             /* Garbage after closing bracket */
 346             freez(buf);
 347             return JB_ERR_PARSE;
 348          }
 349       }
 350       else
 351       {
 352          /* Plain non-escaped hostname */
 353          port = strchr(host, ':');
 354       }
 355
 356       /* check if url contains port */
 357       if (port != NULL)
 358       {
 359          /* Contains port */
 360          /* Terminate hostname and point to start of port string */
 361          *port++ = '\0';
 362          http->port = atoi(port);
 363       }
 364       else
 365       {
 366          /* No port specified. */
 367          http->port = (http->ssl ? 443 : 80);
 368       }
 369
 370       http->host = strdup(host);
 371
 372       freez(buf);
 373
 374       if (http->host == NULL)
 375       {
 376          return JB_ERR_MEMORY;
 377       }
 378    }
 379
 380    /*
 381     * Split domain name so we can compare it against wildcards
 382     */
 383    return init_domain_components(http);
 384
 385 }
 386
 387
 388 /*********************************************************************
 389  *
 390  * Function    :  unknown_method
 391  *
 392  * Description :  Checks whether a method is unknown.
 393  *
 394  * Parameters  :
 395  *          1  :  method = points to a http method
 396  *
 397  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 398  *
 399  *********************************************************************/
 400 static int unknown_method(const char *method)
 401 {
 402    static const char *known_http_methods[] = {
 403       /* Basic HTTP request type */
 404       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 405       /* webDAV extensions (RFC2518) */
 406       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 407       /*
 408        * Microsoft webDAV extension for Exchange 2000.  See:
 409        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 410        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 411        */
 412       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 413       /*
 414        * Another Microsoft webDAV extension for Exchange 2000.  See:
 415        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 416        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 417        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 418        */
 419       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 420       /*
 421        * Yet another WebDAV extension, this time for
 422        * Web Distributed Authoring and Versioning (RFC3253)
 423        */
 424       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 425       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 426    };
 427    int i;
 428
 429    for (i = 0; i < SZ(known_http_methods); i++)
 430    {
 431       if (0 == strcmpic(method, known_http_methods[i]))
 432       {
 433          return FALSE;
 434       }
 435    }
 436
 437    return TRUE;
 438
 439 }
 440
 441
 442 /*********************************************************************
 443  *
 444  * Function    :  parse_http_request
 445  *
 446  * Description :  Parse out the host and port from the URL.  Find the
 447  *                hostname & path, port (if ':'), and/or password (if '@')
 448  *
 449  * Parameters  :
 450  *          1  :  req = HTTP request line to break down
 451  *          2  :  http = pointer to the http structure to hold elements
 452  *
 453  * Returns     :  JB_ERR_OK on success
 454  *                JB_ERR_MEMORY on out of memory
 455  *                JB_ERR_CGI_PARAMS on malformed command/URL
 456  *                                  or >100 domains deep.
 457  *
 458  *********************************************************************/
 459 jb_err parse_http_request(const char *req, struct http_request *http)
 460 {
 461    char *buf;
 462    char *v[10]; /* XXX: Why 10? We should only need three. */
 463    int n;
 464    jb_err err;
 465
 466    memset(http, '\0', sizeof(*http));
 467
 468    buf = strdup(req);
 469    if (buf == NULL)
 470    {
 471       return JB_ERR_MEMORY;
 472    }
 473
 474    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 475    if (n != 3)
 476    {
 477       freez(buf);
 478       return JB_ERR_PARSE;
 479    }
 480
 481    /*
 482     * Fail in case of unknown methods
 483     * which we might not handle correctly.
 484     *
 485     * XXX: There should be a config option
 486     * to forward requests with unknown methods
 487     * anyway. Most of them don't need special
 488     * steps.
 489     */
 490    if (unknown_method(v[0]))
 491    {
 492       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 493       freez(buf);
 494       return JB_ERR_PARSE;
 495    }
 496
 497    if (strcmpic(v[2], "HTTP/1.1") && strcmpic(v[2], "HTTP/1.0"))
 498    {
 499       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 500          "versions are 1.0 and 1.1. This rules out: %s", v[2]);
 501       freez(buf);
 502       return JB_ERR_PARSE;
 503    }
 504
 505    http->ssl = !strcmpic(v[0], "CONNECT");
 506
 507    err = parse_http_url(v[1], http, !http->ssl);
 508    if (err)
 509    {
 510       freez(buf);
 511       return err;
 512    }
 513
 514    /*
 515     * Copy the details into the structure
 516     */
 517    http->cmd = strdup(req);
 518    http->gpc = strdup(v[0]);
 519    http->ver = strdup(v[2]);
 520
 521    freez(buf);
 522
 523    if ( (http->cmd == NULL)
 524      || (http->gpc == NULL)
 525      || (http->ver == NULL) )
 526    {
 527       return JB_ERR_MEMORY;
 528    }
 529
 530    return JB_ERR_OK;
 531
 532 }
 533
 534
 535 /*********************************************************************
 536  *
 537  * Function    :  compile_pattern
 538  *
 539  * Description :  Compiles a host, domain or TAG pattern.
 540  *
 541  * Parameters  :
 542  *          1  :  pattern = The pattern to compile.
 543  *          2  :  anchoring = How the regex should be anchored.
 544  *                            Can be either one of NO_ANCHORING,
 545  *                            LEFT_ANCHORED or RIGHT_ANCHORED.
 546  *          3  :  url     = In case of failures, the spec member is
 547  *                          logged and the structure freed.
 548  *          4  :  regex   = Where the compiled regex should be stored.
 549  *
 550  * Returns     :  JB_ERR_OK - Success
 551  *                JB_ERR_MEMORY - Out of memory
 552  *                JB_ERR_PARSE - Cannot parse regex
 553  *
 554  *********************************************************************/
 555 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 556                               struct url_spec *url, regex_t **regex)
 557 {
 558    int errcode;
 559    char rebuf[BUFFER_SIZE];
 560    const char *fmt = NULL;
 561
 562    assert(pattern);
 563    assert(strlen(pattern) < sizeof(rebuf) - 2);
 564
 565    if (pattern[0] == '\0')
 566    {
 567       *regex = NULL;
 568       return JB_ERR_OK;
 569    }
 570
 571    switch (anchoring)
 572    {
 573       case NO_ANCHORING:
 574          fmt = "%s";
 575          break;
 576       case RIGHT_ANCHORED:
 577          fmt = "%s$";
 578          break;
 579       case LEFT_ANCHORED:
 580          fmt = "^%s";
 581          break;
 582       default:
 583          log_error(LOG_LEVEL_FATAL,
 584             "Invalid anchoring in compile_pattern %d", anchoring);
 585    }
 586
 587    *regex = zalloc(sizeof(**regex));
 588    if (NULL == *regex)
 589    {
 590       free_url_spec(url);
 591       return JB_ERR_MEMORY;
 592    }
 593
 594    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 595
 596    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 597
 598    if (errcode)
 599    {
 600       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 601       if (errlen > (sizeof(rebuf) - (size_t)1))
 602       {
 603          errlen = sizeof(rebuf) - (size_t)1;
 604       }
 605       rebuf[errlen] = '\0';
 606       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 607          pattern, url->spec, rebuf);
 608       free_url_spec(url);
 609
 610       return JB_ERR_PARSE;
 611    }
 612
 613    return JB_ERR_OK;
 614
 615 }
 616
 617
 618 /*********************************************************************
 619  *
 620  * Function    :  compile_url_pattern
 621  *
 622  * Description :  Compiles the three parts of an URL pattern.
 623  *
 624  * Parameters  :
 625  *          1  :  url = Target url_spec to be filled in.
 626  *          2  :  buf = The url pattern to compile. Will be messed up.
 627  *
 628  * Returns     :  JB_ERR_OK - Success
 629  *                JB_ERR_MEMORY - Out of memory
 630  *                JB_ERR_PARSE - Cannot parse regex
 631  *
 632  *********************************************************************/
 633 static jb_err compile_url_pattern(struct url_spec *url, char *buf)
 634 {
 635    char *p;
 636
 637    p = strchr(buf, '/');
 638    if (NULL != p)
 639    {
 640       /*
 641        * Only compile the regex if it consists of more than
 642        * a single slash, otherwise it wouldn't affect the result.
 643        */
 644       if (p[1] != '\0')
 645       {
 646          /*
 647           * XXX: does it make sense to compile the slash at the beginning?
 648           */
 649          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->preg);
 650
 651          if (JB_ERR_OK != err)
 652          {
 653             return err;
 654          }
 655       }
 656       *p = '\0';
 657    }
 658
 659    /*
 660     * IPv6 numeric hostnames can contain colons, thus we need
 661     * to delimit the hostname before the real port separator.
 662     * As brackets are already used in the hostname pattern,
 663     * we use angle brackets ('<', '>') instead.
 664     */
 665    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 666    {
 667       *p++ = '\0';
 668       buf++;
 669
 670       if (*p == '\0')
 671       {
 672          /* IPv6 address without port number */
 673          p = NULL;
 674       }
 675       else if (*p != ':')
 676       {
 677          /* Garbage after address delimiter */
 678          return JB_ERR_PARSE;
 679       }
 680    }
 681    else
 682    {
 683       p = strchr(buf, ':');
 684    }
 685
 686    if (NULL != p)
 687    {
 688       *p++ = '\0';
 689       url->port_list = strdup(p);
 690       if (NULL == url->port_list)
 691       {
 692          return JB_ERR_MEMORY;
 693       }
 694    }
 695    else
 696    {
 697       url->port_list = NULL;
 698    }
 699
 700    if (buf[0] != '\0')
 701    {
 702       return compile_host_pattern(url, buf);
 703    }
 704
 705    return JB_ERR_OK;
 706
 707 }
 708
 709
 710 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 711 /*********************************************************************
 712  *
 713  * Function    :  compile_host_pattern
 714  *
 715  * Description :  Parses and compiles a host pattern..
 716  *
 717  * Parameters  :
 718  *          1  :  url = Target url_spec to be filled in.
 719  *          2  :  host_pattern = Host pattern to compile.
 720  *
 721  * Returns     :  JB_ERR_OK - Success
 722  *                JB_ERR_MEMORY - Out of memory
 723  *                JB_ERR_PARSE - Cannot parse regex
 724  *
 725  *********************************************************************/
 726 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 727 {
 728    return compile_pattern(host_pattern, RIGHT_ANCHORED, url, &url->host_regex);
 729 }
 730
 731 #else
 732
 733 /*********************************************************************
 734  *
 735  * Function    :  compile_host_pattern
 736  *
 737  * Description :  Parses and "compiles" an old-school host pattern.
 738  *
 739  * Parameters  :
 740  *          1  :  url = Target url_spec to be filled in.
 741  *          2  :  host_pattern = Host pattern to parse.
 742  *
 743  * Returns     :  JB_ERR_OK - Success
 744  *                JB_ERR_MEMORY - Out of memory
 745  *                JB_ERR_PARSE - Cannot parse regex
 746  *
 747  *********************************************************************/
 748 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 749 {
 750    char *v[150];
 751    size_t size;
 752    char *p;
 753
 754    /*
 755     * Parse domain part
 756     */
 757    if (host_pattern[strlen(host_pattern) - 1] == '.')
 758    {
 759       url->unanchored |= ANCHOR_RIGHT;
 760    }
 761    if (host_pattern[0] == '.')
 762    {
 763       url->unanchored |= ANCHOR_LEFT;
 764    }
 765
 766    /*
 767     * Split domain into components
 768     */
 769    url->dbuffer = strdup(host_pattern);
 770    if (NULL == url->dbuffer)
 771    {
 772       free_url_spec(url);
 773       return JB_ERR_MEMORY;
 774    }
 775
 776    /*
 777     * Map to lower case
 778     */
 779    for (p = url->dbuffer; *p ; p++)
 780    {
 781       *p = (char)tolower((int)(unsigned char)*p);
 782    }
 783
 784    /*
 785     * Split the domain name into components
 786     */
 787    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 788
 789    if (url->dcount < 0)
 790    {
 791       free_url_spec(url);
 792       return JB_ERR_MEMORY;
 793    }
 794    else if (url->dcount != 0)
 795    {
 796       /*
 797        * Save a copy of the pointers in dvec
 798        */
 799       size = (size_t)url->dcount * sizeof(*url->dvec);
 800
 801       url->dvec = (char **)malloc(size);
 802       if (NULL == url->dvec)
 803       {
 804          free_url_spec(url);
 805          return JB_ERR_MEMORY;
 806       }
 807
 808       memcpy(url->dvec, v, size);
 809    }
 810    /*
 811     * else dcount == 0 in which case we needn't do anything,
 812     * since dvec will never be accessed and the pattern will
 813     * match all domains.
 814     */
 815    return JB_ERR_OK;
 816 }
 817
 818
 819 /*********************************************************************
 820  *
 821  * Function    :  simplematch
 822  *
 823  * Description :  String matching, with a (greedy) '*' wildcard that
 824  *                stands for zero or more arbitrary characters and
 825  *                character classes in [], which take both enumerations
 826  *                and ranges.
 827  *
 828  * Parameters  :
 829  *          1  :  pattern = pattern for matching
 830  *          2  :  text    = text to be matched
 831  *
 832  * Returns     :  0 if match, else nonzero
 833  *
 834  *********************************************************************/
 835 static int simplematch(const char *pattern, const char *text)
 836 {
 837    const unsigned char *pat = (const unsigned char *)pattern;
 838    const unsigned char *txt = (const unsigned char *)text;
 839    const unsigned char *fallback = pat;
 840    int wildcard = 0;
 841
 842    unsigned char lastchar = 'a';
 843    unsigned i;
 844    unsigned char charmap[32];
 845
 846    while (*txt)
 847    {
 848
 849       /* EOF pattern but !EOF text? */
 850       if (*pat == '\0')
 851       {
 852          if (wildcard)
 853          {
 854             pat = fallback;
 855          }
 856          else
 857          {
 858             return 1;
 859          }
 860       }
 861
 862       /* '*' in the pattern?  */
 863       if (*pat == '*')
 864       {
 865
 866          /* The pattern ends afterwards? Speed up the return. */
 867          if (*++pat == '\0')
 868          {
 869             return 0;
 870          }
 871
 872          /* Else, set wildcard mode and remember position after '*' */
 873          wildcard = 1;
 874          fallback = pat;
 875       }
 876
 877       /* Character range specification? */
 878       if (*pat == '[')
 879       {
 880          memset(charmap, '\0', sizeof(charmap));
 881
 882          while (*++pat != ']')
 883          {
 884             if (!*pat)
 885             {
 886                return 1;
 887             }
 888             else if (*pat == '-')
 889             {
 890                if ((*++pat == ']') || *pat == '\0')
 891                {
 892                   return(1);
 893                }
 894                for (i = lastchar; i <= *pat; i++)
 895                {
 896                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 897                }
 898             }
 899             else
 900             {
 901                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 902                lastchar = *pat;
 903             }
 904          }
 905       } /* -END- if Character range specification */
 906
 907
 908       /*
 909        * Char match, or char range match?
 910        */
 911       if ( (*pat == *txt)
 912       ||   (*pat == '?')
 913       ||   ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))) )
 914       {
 915          /*
 916           * Sucess: Go ahead
 917           */
 918          pat++;
 919       }
 920       else if (!wildcard)
 921       {
 922          /*
 923           * No match && no wildcard: No luck
 924           */
 925          return 1;
 926       }
 927       else if (pat != fallback)
 928       {
 929          /*
 930           * Increment text pointer if in char range matching
 931           */
 932          if (*pat == ']')
 933          {
 934             txt++;
 935          }
 936          /*
 937           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 938           */
 939          pat = fallback;
 940          /*
 941           * Restart matching from current text pointer
 942           */
 943          continue;
 944       }
 945       txt++;
 946    }
 947
 948    /* Cut off extra '*'s */
 949    if(*pat == '*')  pat++;
 950
 951    /* If this is the pattern's end, fine! */
 952    return(*pat);
 953
 954 }
 955
 956
 957 /*********************************************************************
 958  *
 959  * Function    :  simple_domaincmp
 960  *
 961  * Description :  Domain-wise Compare fqdn's.  The comparison is
 962  *                both left- and right-anchored.  The individual
 963  *                domain names are compared with simplematch().
 964  *                This is only used by domain_match.
 965  *
 966  * Parameters  :
 967  *          1  :  pv = array of patterns to compare
 968  *          2  :  fv = array of domain components to compare
 969  *          3  :  len = length of the arrays (both arrays are the
 970  *                      same length - if they weren't, it couldn't
 971  *                      possibly be a match).
 972  *
 973  * Returns     :  0 => domains are equivalent, else no match.
 974  *
 975  *********************************************************************/
 976 static int simple_domaincmp(char **pv, char **fv, int len)
 977 {
 978    int n;
 979
 980    for (n = 0; n < len; n++)
 981    {
 982       if (simplematch(pv[n], fv[n]))
 983       {
 984          return 1;
 985       }
 986    }
 987
 988    return 0;
 989
 990 }
 991
 992
 993 /*********************************************************************
 994  *
 995  * Function    :  domain_match
 996  *
 997  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 998  *                pattern->unachored, the comparison is un-, left-,
 999  *                right-anchored, or both.
1000  *                The individual domain names are compared with
1001  *                simplematch().
1002  *
1003  * Parameters  :
1004  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
1005  *          2  :  fqdn = domain name against which the patterns are compared.
1006  *
1007  * Returns     :  0 => domains are equivalent, else no match.
1008  *
1009  *********************************************************************/
1010 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
1011 {
1012    char **pv, **fv;  /* vectors  */
1013    int    plen, flen;
1014    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1015
1016    plen = pattern->dcount;
1017    flen = fqdn->dcount;
1018
1019    if (flen < plen)
1020    {
1021       /* fqdn is too short to match this pattern */
1022       return 1;
1023    }
1024
1025    pv   = pattern->dvec;
1026    fv   = fqdn->dvec;
1027
1028    if (unanchored == ANCHOR_LEFT)
1029    {
1030       /*
1031        * Right anchored.
1032        *
1033        * Convert this into a fully anchored pattern with
1034        * the fqdn and pattern the same length
1035        */
1036       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1037       return simple_domaincmp(pv, fv, plen);
1038    }
1039    else if (unanchored == 0)
1040    {
1041       /* Fully anchored, check length */
1042       if (flen != plen)
1043       {
1044          return 1;
1045       }
1046       return simple_domaincmp(pv, fv, plen);
1047    }
1048    else if (unanchored == ANCHOR_RIGHT)
1049    {
1050       /* Left anchored, ignore all extra in fqdn */
1051       return simple_domaincmp(pv, fv, plen);
1052    }
1053    else
1054    {
1055       /* Unanchored */
1056       int n;
1057       int maxn = flen - plen;
1058       for (n = 0; n <= maxn; n++)
1059       {
1060          if (!simple_domaincmp(pv, fv, plen))
1061          {
1062             return 0;
1063          }
1064          /*
1065           * Doesn't match from start of fqdn
1066           * Try skipping first part of fqdn
1067           */
1068          fv++;
1069       }
1070       return 1;
1071    }
1072
1073 }
1074 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1075
1076
1077 /*********************************************************************
1078  *
1079  * Function    :  create_url_spec
1080  *
1081  * Description :  Creates a "url_spec" structure from a string.
1082  *                When finished, free with free_url_spec().
1083  *
1084  * Parameters  :
1085  *          1  :  url = Target url_spec to be filled in.  Will be
1086  *                      zeroed before use.
1087  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1088  *                      contents of this buffer are destroyed by this
1089  *                      function.  If this function succeeds, the
1090  *                      buffer is copied to url->spec.  If this
1091  *                      function fails, the contents of the buffer
1092  *                      are lost forever.
1093  *
1094  * Returns     :  JB_ERR_OK - Success
1095  *                JB_ERR_MEMORY - Out of memory
1096  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1097  *                               written to system log)
1098  *
1099  *********************************************************************/
1100 jb_err create_url_spec(struct url_spec *url, char *buf)
1101 {
1102    assert(url);
1103    assert(buf);
1104
1105    memset(url, '\0', sizeof(*url));
1106
1107    /* Remember the original specification for the CGI pages. */
1108    url->spec = strdup(buf);
1109    if (NULL == url->spec)
1110    {
1111       return JB_ERR_MEMORY;
1112    }
1113
1114    /* Is it tag pattern? */
1115    if (0 == strncmpic("TAG:", url->spec, 4))
1116    {
1117       /* The pattern starts with the first character after "TAG:" */
1118       const char *tag_pattern = buf + 4;
1119       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
1120    }
1121
1122    /* If it isn't a tag pattern it must be a URL pattern. */
1123    return compile_url_pattern(url, buf);
1124 }
1125
1126
1127 /*********************************************************************
1128  *
1129  * Function    :  free_url_spec
1130  *
1131  * Description :  Called from the "unloaders".  Freez the url
1132  *                structure elements.
1133  *
1134  * Parameters  :
1135  *          1  :  url = pointer to a url_spec structure.
1136  *
1137  * Returns     :  N/A
1138  *
1139  *********************************************************************/
1140 void free_url_spec(struct url_spec *url)
1141 {
1142    if (url == NULL) return;
1143
1144    freez(url->spec);
1145 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1146    if (url->host_regex)
1147    {
1148       regfree(url->host_regex);
1149       freez(url->host_regex);
1150    }
1151 #else
1152    freez(url->dbuffer);
1153    freez(url->dvec);
1154    url->dcount = 0;
1155 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1156    freez(url->port_list);
1157    if (url->preg)
1158    {
1159       regfree(url->preg);
1160       freez(url->preg);
1161    }
1162    if (url->tag_regex)
1163    {
1164       regfree(url->tag_regex);
1165       freez(url->tag_regex);
1166    }
1167 }
1168
1169
1170 /*********************************************************************
1171  *
1172  * Function    :  url_match
1173  *
1174  * Description :  Compare a URL against a URL pattern.
1175  *
1176  * Parameters  :
1177  *          1  :  pattern = a URL pattern
1178  *          2  :  url = URL to match
1179  *
1180  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1181  *
1182  *********************************************************************/
1183 int url_match(const struct url_spec *pattern,
1184               const struct http_request *http)
1185 {
1186    /* XXX: these should probably be functions. */
1187 #define PORT_MATCHES ((NULL == pattern->port_list) || match_portlist(pattern->port_list, http->port))
1188 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1189 #define DOMAIN_MATCHES ((NULL == pattern->host_regex) || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)))
1190 #else
1191 #define DOMAIN_MATCHES ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)))
1192 #endif
1193 #define PATH_MATCHES ((NULL == pattern->preg) || (0 == regexec(pattern->preg, http->path, 0, NULL, 0)))
1194
1195    if (pattern->tag_regex != NULL)
1196    {
1197       /* It's a tag pattern and shouldn't be matched against URLs */
1198       return 0;
1199    }
1200
1201    return (PORT_MATCHES && DOMAIN_MATCHES && PATH_MATCHES);
1202
1203 }
1204
1205
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place. No copy is made, so
 *                the check can't fail due to memory exhaustion.
 *
 *                Numbers are converted the way atoi() does it, which
 *                means that anything that doesn't start with a number
 *                counts as 0. In a range "min-max" a missing min is
 *                thus 0; a max that is missing or 0 means 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      const char *dash = NULL;
      const char *end  = item;

      /*
       * Find the end of the current item and remember
       * the first dash inside of it, if there is one.
       */
      while ((*end != '\0') && (*end != ','))
      {
         if ((*end == '-') && (NULL == dash))
         {
            dash = end;
         }
         end++;
      }

      if (NULL == dash)
      {
         /*
          * No dash, check for equality. The conversion
          * stops at the comma or the end of the string.
          */
         if (port == (int)strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         char *parsed_to;
         int low  = (int)strtol(item, &parsed_to, 10);
         int high = (int)strtol(dash + 1, NULL, 10);

         if (parsed_to > dash)
         {
            /*
             * There was no number in front of the dash and
             * strtol() consumed the dash as a sign. The lower
             * bound is empty and counts as 0.
             */
            low = 0;
         }
         if (0 == high)
         {
            high = 65535;
         }

         if ((port >= low) && (port <= high))
         {
            return 1;
         }
      }

      /*
       * Jump to next item
       */
      item = (*end == ',') ? end + 1 : NULL;
   }

   return 0;

}
1284
1285
1286 /*********************************************************************
1287  *
1288  * Function    :  parse_forwarder_address
1289  *
1290  * Description :  Parse out the host and port from a forwarder address.
1291  *
1292  * Parameters  :
1293  *          1  :  address = The forwarder address to parse.
1294  *          2  :  hostname = Used to return the hostname. NULL on error.
1295  *          3  :  port = Used to return the port. Untouched if no port
1296  *                       is specified.
1297  *
1298  * Returns     :  JB_ERR_OK on success
1299  *                JB_ERR_MEMORY on out of memory
1300  *                JB_ERR_PARSE on malformed address.
1301  *
1302  *********************************************************************/
1303 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1304 {
1305    char *p = address;
1306
1307    if ((*address == '[') && (NULL == strchr(address, ']')))
1308    {
1309       /* XXX: Should do some more validity checks here. */
1310       return JB_ERR_PARSE;
1311    }
1312
1313    *hostname = strdup(address);
1314    if (NULL == *hostname)
1315    {
1316       return JB_ERR_MEMORY;
1317    }
1318
1319    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1320    {
1321       *p++ = '\0';
1322       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1323       if (*p == ':')
1324       {
1325          *port = (int)strtol(++p, NULL, 0);
1326       }
1327    }
1328    else if (NULL != (p = strchr(*hostname, ':')))
1329    {
1330       *p++ = '\0';
1331       *port = (int)strtol(p, NULL, 0);
1332    }
1333
1334    return JB_ERR_OK;
1335
1336 }
1337
1338
1339 /*
1340   Local Variables:
1341   tab-width: 3
1342   end:
1343 */