urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.71 2012/06/08 15:15:11 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2011
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
  59 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  60
  61 enum regex_anchoring
  62 {
  63    NO_ANCHORING,
  64    LEFT_ANCHORED,
  65    RIGHT_ANCHORED,
  66    RIGHT_ANCHORED_HOST
  67 };
  68 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern);
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_MEMORY on out of memory
 119  *                JB_ERR_PARSE on malformed command/URL
 120  *                             or >100 domains deep.
 121  *
 122  *********************************************************************/
 123 jb_err init_domain_components(struct http_request *http)
 124 {
 125    char *vec[BUFFER_SIZE];
 126    size_t size;
 127    char *p;
 128
 129    http->dbuffer = strdup(http->host);
 130    if (NULL == http->dbuffer)
 131    {
 132       return JB_ERR_MEMORY;
 133    }
 134
 135    /* map to lower case */
 136    for (p = http->dbuffer; *p ; p++)
 137    {
 138       *p = (char)privoxy_tolower(*p);
 139    }
 140
 141    /* split the domain name into components */
 142    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 143
 144    if (http->dcount <= 0)
 145    {
 146       /*
 147        * Error: More than SZ(vec) components in domain
 148        *    or: no components in domain
 149        */
 150       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 151       return JB_ERR_PARSE;
 152    }
 153
 154    /* save a copy of the pointers in dvec */
 155    size = (size_t)http->dcount * sizeof(*http->dvec);
 156
 157    http->dvec = malloc_or_die(size);
 158
 159    memcpy(http->dvec, vec, size);
 160
 161    return JB_ERR_OK;
 162 }
 163 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 164
 165
 166 /*********************************************************************
 167  *
 168  * Function    :  url_requires_percent_encoding
 169  *
 170  * Description :  Checks if an URL contains invalid characters
 171  *                according to RFC 3986 that should be percent-encoded.
 172  *                Does not verify whether or not the passed string
 173  *                actually is a valid URL.
 174  *
 175  * Parameters  :
 176  *          1  :  url = URL to check
 177  *
 178  * Returns     :  True in case of valid URLs, false otherwise
 179  *
 180  *********************************************************************/
 181 int url_requires_percent_encoding(const char *url)
 182 {
 183    static const char allowed_characters[128] = {
 184       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 185       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 186       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 187       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 188       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 189       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 190       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 191       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 192       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 193       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 194       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 195       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 196       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 197    };
 198
 199    while (*url != '\0')
 200    {
 201       const unsigned int i = (unsigned char)*url++;
 202       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 203       {
 204          return TRUE;
 205       }
 206    }
 207
 208    return FALSE;
 209
 210 }
 211
 212
 213 /*********************************************************************
 214  *
 215  * Function    :  parse_http_url
 216  *
 217  * Description :  Parse out the host and port from the URL.  Find the
 218  *                hostname & path, port (if ':'), and/or password (if '@')
 219  *
 220  * Parameters  :
 221  *          1  :  url = URL (or is it URI?) to break down
 222  *          2  :  http = pointer to the http structure to hold elements.
 223  *                       Must be initialized with valid values (like NULLs).
 224  *          3  :  require_protocol = Whether or not URLs without
 225  *                                   protocol are acceptable.
 226  *
 227  * Returns     :  JB_ERR_OK on success
 228  *                JB_ERR_MEMORY on out of memory
 229  *                JB_ERR_PARSE on malformed command/URL
 230  *                             or >100 domains deep.
 231  *
 232  *********************************************************************/
 233 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 234 {
 235    int host_available = 1; /* A proxy can dream. */
 236
 237    /*
 238     * Save our initial URL
 239     */
 240    http->url = strdup(url);
 241    if (http->url == NULL)
 242    {
 243       return JB_ERR_MEMORY;
 244    }
 245
 246
 247    /*
 248     * Check for * URI. If found, we're done.
 249     */
 250    if (*http->url == '*')
 251    {
 252       if (NULL == (http->path = strdup("*"))
 253        || NULL == (http->hostport = strdup("")))
 254       {
 255          return JB_ERR_MEMORY;
 256       }
 257       if (http->url[1] != '\0')
 258       {
 259          return JB_ERR_PARSE;
 260       }
 261       return JB_ERR_OK;
 262    }
 263
 264
 265    /*
 266     * Split URL into protocol,hostport,path.
 267     */
 268    {
 269       char *buf;
 270       char *url_noproto;
 271       char *url_path;
 272
 273       buf = strdup(url);
 274       if (buf == NULL)
 275       {
 276          return JB_ERR_MEMORY;
 277       }
 278
 279       /* Find the start of the URL in our scratch space */
 280       url_noproto = buf;
 281       if (strncmpic(url_noproto, "http://",  7) == 0)
 282       {
 283          url_noproto += 7;
 284       }
 285       else if (strncmpic(url_noproto, "https://", 8) == 0)
 286       {
 287          /*
 288           * Should only happen when called from cgi_show_url_info().
 289           */
 290          url_noproto += 8;
 291          http->ssl = 1;
 292       }
 293       else if (*url_noproto == '/')
 294       {
 295         /*
 296          * Short request line without protocol and host.
 297          * Most likely because the client's request
 298          * was intercepted and redirected into Privoxy.
 299          */
 300          http->host = NULL;
 301          host_available = 0;
 302       }
 303       else if (require_protocol)
 304       {
 305          freez(buf);
 306          return JB_ERR_PARSE;
 307       }
 308
 309       url_path = strchr(url_noproto, '/');
 310       if (url_path != NULL)
 311       {
 312          /*
 313           * Got a path.
 314           *
 315           * NOTE: The following line ignores the path for HTTPS URLS.
 316           * This means that you get consistent behaviour if you type a
 317           * https URL in and it's parsed by the function.  (When the
 318           * URL is actually retrieved, SSL hides the path part).
 319           */
 320          http->path = strdup(http->ssl ? "/" : url_path);
 321          *url_path = '\0';
 322          http->hostport = strdup(url_noproto);
 323       }
 324       else
 325       {
 326          /*
 327           * Repair broken HTTP requests that don't contain a path,
 328           * or CONNECT requests
 329           */
 330          http->path = strdup("/");
 331          http->hostport = strdup(url_noproto);
 332       }
 333
 334       freez(buf);
 335
 336       if ((http->path == NULL)
 337        || (http->hostport == NULL))
 338       {
 339          return JB_ERR_MEMORY;
 340       }
 341    }
 342
 343    if (!host_available)
 344    {
 345       /* Without host, there is nothing left to do here */
 346       return JB_ERR_OK;
 347    }
 348
 349    /*
 350     * Split hostport into user/password (ignored), host, port.
 351     */
 352    {
 353       char *buf;
 354       char *host;
 355       char *port;
 356
 357       buf = strdup(http->hostport);
 358       if (buf == NULL)
 359       {
 360          return JB_ERR_MEMORY;
 361       }
 362
 363       /* check if url contains username and/or password */
 364       host = strchr(buf, '@');
 365       if (host != NULL)
 366       {
 367          /* Contains username/password, skip it and the @ sign. */
 368          host++;
 369       }
 370       else
 371       {
 372          /* No username or password. */
 373          host = buf;
 374       }
 375
 376       /* Move after hostname before port number */
 377       if (*host == '[')
 378       {
 379          /* Numeric IPv6 address delimited by brackets */
 380          host++;
 381          port = strchr(host, ']');
 382
 383          if (port == NULL)
 384          {
 385             /* Missing closing bracket */
 386             freez(buf);
 387             return JB_ERR_PARSE;
 388          }
 389
 390          *port++ = '\0';
 391
 392          if (*port == '\0')
 393          {
 394             port = NULL;
 395          }
 396          else if (*port != ':')
 397          {
 398             /* Garbage after closing bracket */
 399             freez(buf);
 400             return JB_ERR_PARSE;
 401          }
 402       }
 403       else
 404       {
 405          /* Plain non-escaped hostname */
 406          port = strchr(host, ':');
 407       }
 408
 409       /* check if url contains port */
 410       if (port != NULL)
 411       {
 412          /* Contains port */
 413          /* Terminate hostname and point to start of port string */
 414          *port++ = '\0';
 415          http->port = atoi(port);
 416       }
 417       else
 418       {
 419          /* No port specified. */
 420          http->port = (http->ssl ? 443 : 80);
 421       }
 422
 423       http->host = strdup(host);
 424
 425       freez(buf);
 426
 427       if (http->host == NULL)
 428       {
 429          return JB_ERR_MEMORY;
 430       }
 431    }
 432
 433 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 434    return JB_ERR_OK;
 435 #else
 436    /* Split domain name so we can compare it against wildcards */
 437    return init_domain_components(http);
 438 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 439
 440 }
 441
 442
 443 /*********************************************************************
 444  *
 445  * Function    :  unknown_method
 446  *
 447  * Description :  Checks whether a method is unknown.
 448  *
 449  * Parameters  :
 450  *          1  :  method = points to a http method
 451  *
 452  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 453  *
 454  *********************************************************************/
 455 static int unknown_method(const char *method)
 456 {
 457    static const char * const known_http_methods[] = {
 458       /* Basic HTTP request type */
 459       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 460       /* webDAV extensions (RFC2518) */
 461       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 462       /*
 463        * Microsoft webDAV extension for Exchange 2000.  See:
 464        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 465        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 466        */
 467       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 468       /*
 469        * Another Microsoft webDAV extension for Exchange 2000.  See:
 470        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 471        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 472        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 473        */
 474       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 475       /*
 476        * Yet another WebDAV extension, this time for
 477        * Web Distributed Authoring and Versioning (RFC3253)
 478        */
 479       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 480       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 481    };
 482    int i;
 483
 484    for (i = 0; i < SZ(known_http_methods); i++)
 485    {
 486       if (0 == strcmpic(method, known_http_methods[i]))
 487       {
 488          return FALSE;
 489       }
 490    }
 491
 492    return TRUE;
 493
 494 }
 495
 496
 497 /*********************************************************************
 498  *
 499  * Function    :  parse_http_request
 500  *
 501  * Description :  Parse out the host and port from the URL.  Find the
 502  *                hostname & path, port (if ':'), and/or password (if '@')
 503  *
 504  * Parameters  :
 505  *          1  :  req = HTTP request line to break down
 506  *          2  :  http = pointer to the http structure to hold elements
 507  *
 508  * Returns     :  JB_ERR_OK on success
 509  *                JB_ERR_MEMORY on out of memory
 510  *                JB_ERR_CGI_PARAMS on malformed command/URL
 511  *                                  or >100 domains deep.
 512  *
 513  *********************************************************************/
 514 jb_err parse_http_request(const char *req, struct http_request *http)
 515 {
 516    char *buf;
 517    char *v[10]; /* XXX: Why 10? We should only need three. */
 518    int n;
 519    jb_err err;
 520
 521    memset(http, '\0', sizeof(*http));
 522
 523    buf = strdup(req);
 524    if (buf == NULL)
 525    {
 526       return JB_ERR_MEMORY;
 527    }
 528
 529    n = ssplit(buf, " \r\n", v, SZ(v));
 530    if (n != 3)
 531    {
 532       freez(buf);
 533       return JB_ERR_PARSE;
 534    }
 535
 536    /*
 537     * Fail in case of unknown methods
 538     * which we might not handle correctly.
 539     *
 540     * XXX: There should be a config option
 541     * to forward requests with unknown methods
 542     * anyway. Most of them don't need special
 543     * steps.
 544     */
 545    if (unknown_method(v[0]))
 546    {
 547       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 548       freez(buf);
 549       return JB_ERR_PARSE;
 550    }
 551
 552    if (strcmpic(v[2], "HTTP/1.1") && strcmpic(v[2], "HTTP/1.0"))
 553    {
 554       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 555          "versions are 1.0 and 1.1. This rules out: %s", v[2]);
 556       freez(buf);
 557       return JB_ERR_PARSE;
 558    }
 559
 560    http->ssl = !strcmpic(v[0], "CONNECT");
 561
 562    err = parse_http_url(v[1], http, !http->ssl);
 563    if (err)
 564    {
 565       freez(buf);
 566       return err;
 567    }
 568
 569    /*
 570     * Copy the details into the structure
 571     */
 572    http->cmd = strdup(req);
 573    http->gpc = strdup(v[0]);
 574    http->ver = strdup(v[2]);
 575
 576    freez(buf);
 577
 578    if ( (http->cmd == NULL)
 579      || (http->gpc == NULL)
 580      || (http->ver == NULL))
 581    {
 582       return JB_ERR_MEMORY;
 583    }
 584
 585    return JB_ERR_OK;
 586
 587 }
 588
 589
 590 /*********************************************************************
 591  *
 592  * Function    :  compile_pattern
 593  *
 594  * Description :  Compiles a host, domain or TAG pattern.
 595  *
 596  * Parameters  :
 597  *          1  :  pattern = The pattern to compile.
 598  *          2  :  anchoring = How the regex should be modified
 599  *                            before compilation. Can be either
 600  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 601  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 602  *          3  :  url     = In case of failures, the spec member is
 603  *                          logged and the structure freed.
 604  *          4  :  regex   = Where the compiled regex should be stored.
 605  *
 606  * Returns     :  JB_ERR_OK - Success
 607  *                JB_ERR_MEMORY - Out of memory
 608  *                JB_ERR_PARSE - Cannot parse regex
 609  *
 610  *********************************************************************/
 611 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 612                               struct url_spec *url, regex_t **regex)
 613 {
 614    int errcode;
 615    char rebuf[BUFFER_SIZE];
 616    const char *fmt = NULL;
 617
 618    assert(pattern);
 619    assert(strlen(pattern) < sizeof(rebuf) - 2);
 620
 621    if (pattern[0] == '\0')
 622    {
 623       *regex = NULL;
 624       return JB_ERR_OK;
 625    }
 626
 627    switch (anchoring)
 628    {
 629       case NO_ANCHORING:
 630          fmt = "%s";
 631          break;
 632       case RIGHT_ANCHORED:
 633          fmt = "%s$";
 634          break;
 635       case RIGHT_ANCHORED_HOST:
 636          fmt = "%s\\.?$";
 637          break;
 638       case LEFT_ANCHORED:
 639          fmt = "^%s";
 640          break;
 641       default:
 642          log_error(LOG_LEVEL_FATAL,
 643             "Invalid anchoring in compile_pattern %d", anchoring);
 644    }
 645
 646    *regex = zalloc(sizeof(**regex));
 647    if (NULL == *regex)
 648    {
 649       free_url_spec(url);
 650       return JB_ERR_MEMORY;
 651    }
 652
 653    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 654
 655    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 656
 657    if (errcode)
 658    {
 659       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 660       if (errlen > (sizeof(rebuf) - (size_t)1))
 661       {
 662          errlen = sizeof(rebuf) - (size_t)1;
 663       }
 664       rebuf[errlen] = '\0';
 665       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 666          pattern, url->spec, rebuf);
 667       free_url_spec(url);
 668
 669       return JB_ERR_PARSE;
 670    }
 671
 672    return JB_ERR_OK;
 673
 674 }
 675
 676
 677 /*********************************************************************
 678  *
 679  * Function    :  compile_url_pattern
 680  *
 681  * Description :  Compiles the three parts of an URL pattern.
 682  *
 683  * Parameters  :
 684  *          1  :  url = Target url_spec to be filled in.
 685  *          2  :  buf = The url pattern to compile. Will be messed up.
 686  *
 687  * Returns     :  JB_ERR_OK - Success
 688  *                JB_ERR_MEMORY - Out of memory
 689  *                JB_ERR_PARSE - Cannot parse regex
 690  *
 691  *********************************************************************/
 692 static jb_err compile_url_pattern(struct url_spec *url, char *buf)
 693 {
 694    char *p;
 695
 696    p = strchr(buf, '/');
 697    if (NULL != p)
 698    {
 699       /*
 700        * Only compile the regex if it consists of more than
 701        * a single slash, otherwise it wouldn't affect the result.
 702        */
 703       if (p[1] != '\0')
 704       {
 705          /*
 706           * XXX: does it make sense to compile the slash at the beginning?
 707           */
 708          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->preg);
 709
 710          if (JB_ERR_OK != err)
 711          {
 712             return err;
 713          }
 714       }
 715       *p = '\0';
 716    }
 717
 718    /*
 719     * IPv6 numeric hostnames can contain colons, thus we need
 720     * to delimit the hostname before the real port separator.
 721     * As brackets are already used in the hostname pattern,
 722     * we use angle brackets ('<', '>') instead.
 723     */
 724    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 725    {
 726       *p++ = '\0';
 727       buf++;
 728
 729       if (*p == '\0')
 730       {
 731          /* IPv6 address without port number */
 732          p = NULL;
 733       }
 734       else if (*p != ':')
 735       {
 736          /* Garbage after address delimiter */
 737          return JB_ERR_PARSE;
 738       }
 739    }
 740    else
 741    {
 742       p = strchr(buf, ':');
 743    }
 744
 745    if (NULL != p)
 746    {
 747       *p++ = '\0';
 748       url->port_list = strdup(p);
 749       if (NULL == url->port_list)
 750       {
 751          return JB_ERR_MEMORY;
 752       }
 753    }
 754    else
 755    {
 756       url->port_list = NULL;
 757    }
 758
 759    if (buf[0] != '\0')
 760    {
 761       return compile_host_pattern(url, buf);
 762    }
 763
 764    return JB_ERR_OK;
 765
 766 }
 767
 768
 769 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 770 /*********************************************************************
 771  *
 772  * Function    :  compile_host_pattern
 773  *
 774  * Description :  Parses and compiles a host pattern.
 775  *
 776  * Parameters  :
 777  *          1  :  url = Target url_spec to be filled in.
 778  *          2  :  host_pattern = Host pattern to compile.
 779  *
 780  * Returns     :  JB_ERR_OK - Success
 781  *                JB_ERR_MEMORY - Out of memory
 782  *                JB_ERR_PARSE - Cannot parse regex
 783  *
 784  *********************************************************************/
 785 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 786 {
 787    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->host_regex);
 788 }
 789
 790 #else
 791
 792 /*********************************************************************
 793  *
 794  * Function    :  compile_host_pattern
 795  *
 796  * Description :  Parses and "compiles" an old-school host pattern.
 797  *
 798  * Parameters  :
 799  *          1  :  url = Target url_spec to be filled in.
 800  *          2  :  host_pattern = Host pattern to parse.
 801  *
 802  * Returns     :  JB_ERR_OK - Success
 803  *                JB_ERR_MEMORY - Out of memory
 804  *                JB_ERR_PARSE - Cannot parse regex
 805  *
 806  *********************************************************************/
 807 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 808 {
 809    char *v[150];
 810    size_t size;
 811    char *p;
 812
 813    /*
 814     * Parse domain part
 815     */
 816    if (host_pattern[strlen(host_pattern) - 1] == '.')
 817    {
 818       url->unanchored |= ANCHOR_RIGHT;
 819    }
 820    if (host_pattern[0] == '.')
 821    {
 822       url->unanchored |= ANCHOR_LEFT;
 823    }
 824
 825    /*
 826     * Split domain into components
 827     */
 828    url->dbuffer = strdup(host_pattern);
 829    if (NULL == url->dbuffer)
 830    {
 831       free_url_spec(url);
 832       return JB_ERR_MEMORY;
 833    }
 834
 835    /*
 836     * Map to lower case
 837     */
 838    for (p = url->dbuffer; *p ; p++)
 839    {
 840       *p = (char)privoxy_tolower(*p);
 841    }
 842
 843    /*
 844     * Split the domain name into components
 845     */
 846    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v));
 847
 848    if (url->dcount < 0)
 849    {
 850       free_url_spec(url);
 851       return JB_ERR_MEMORY;
 852    }
 853    else if (url->dcount != 0)
 854    {
 855       /*
 856        * Save a copy of the pointers in dvec
 857        */
 858       size = (size_t)url->dcount * sizeof(*url->dvec);
 859
 860       url->dvec = malloc_or_die(size);
 861
 862       memcpy(url->dvec, v, size);
 863    }
 864    /*
 865     * else dcount == 0 in which case we needn't do anything,
 866     * since dvec will never be accessed and the pattern will
 867     * match all domains.
 868     */
 869    return JB_ERR_OK;
 870 }
 871
 872
 873 /*********************************************************************
 874  *
 875  * Function    :  simplematch
 876  *
 877  * Description :  String matching, with a (greedy) '*' wildcard that
 878  *                stands for zero or more arbitrary characters and
 879  *                character classes in [], which take both enumerations
 880  *                and ranges.
 881  *
 882  * Parameters  :
 883  *          1  :  pattern = pattern for matching
 884  *          2  :  text    = text to be matched
 885  *
 886  * Returns     :  0 if match, else nonzero
 887  *
 888  *********************************************************************/
 889 static int simplematch(const char *pattern, const char *text)
 890 {
 891    const unsigned char *pat = (const unsigned char *)pattern;
 892    const unsigned char *txt = (const unsigned char *)text;
 893    const unsigned char *fallback = pat;
 894    int wildcard = 0;
 895
 896    unsigned char lastchar = 'a';
 897    unsigned i;
 898    unsigned char charmap[32];
 899
 900    while (*txt)
 901    {
 902
 903       /* EOF pattern but !EOF text? */
 904       if (*pat == '\0')
 905       {
 906          if (wildcard)
 907          {
 908             pat = fallback;
 909          }
 910          else
 911          {
 912             return 1;
 913          }
 914       }
 915
 916       /* '*' in the pattern?  */
 917       if (*pat == '*')
 918       {
 919
 920          /* The pattern ends afterwards? Speed up the return. */
 921          if (*++pat == '\0')
 922          {
 923             return 0;
 924          }
 925
 926          /* Else, set wildcard mode and remember position after '*' */
 927          wildcard = 1;
 928          fallback = pat;
 929       }
 930
 931       /* Character range specification? */
 932       if (*pat == '[')
 933       {
 934          memset(charmap, '\0', sizeof(charmap));
 935
 936          while (*++pat != ']')
 937          {
 938             if (!*pat)
 939             {
 940                return 1;
 941             }
 942             else if (*pat == '-')
 943             {
 944                if ((*++pat == ']') || *pat == '\0')
 945                {
 946                   return(1);
 947                }
 948                for (i = lastchar; i <= *pat; i++)
 949                {
 950                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 951                }
 952             }
 953             else
 954             {
 955                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 956                lastchar = *pat;
 957             }
 958          }
 959       } /* -END- if Character range specification */
 960
 961
 962       /*
 963        * Char match, or char range match?
 964        */
 965       if ((*pat == *txt)
 966        || (*pat == '?')
 967        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 968       {
 969          /*
 970           * Success: Go ahead
 971           */
 972          pat++;
 973       }
 974       else if (!wildcard)
 975       {
 976          /*
 977           * No match && no wildcard: No luck
 978           */
 979          return 1;
 980       }
 981       else if (pat != fallback)
 982       {
 983          /*
 984           * Increment text pointer if in char range matching
 985           */
 986          if (*pat == ']')
 987          {
 988             txt++;
 989          }
 990          /*
 991           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 992           */
 993          pat = fallback;
 994          /*
 995           * Restart matching from current text pointer
 996           */
 997          continue;
 998       }
 999       txt++;
1000    }
1001
1002    /* Cut off extra '*'s */
1003    if (*pat == '*') pat++;
1004
1005    /* If this is the pattern's end, fine! */
1006    return(*pat);
1007
1008 }
1009
1010
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Compares two vectors of domain components pair by
 *                pair with simplematch(). As every pair has to
 *                match, the comparison is both left- and
 *                right-anchored.
 *                This is only used by domain_match.
 *
 * Parameters  :
 *          1  :  pv = array of patterns to compare
 *          2  :  fv = array of domain components to compare
 *          3  :  len = length of the arrays (both arrays are the
 *                      same length - if they weren't, it couldn't
 *                      possibly be a match).
 *
 * Returns     :  0 => domains are equivalent, else no match.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   int idx = 0;

   while (idx < len)
   {
      if (0 != simplematch(pv[idx], fv[idx]))
      {
         /* The first component that differs settles it */
         return 1;
      }
      idx++;
   }

   return 0;

}
1045
1046
1047 /*********************************************************************
1048  *
1049  * Function    :  domain_match
1050  *
1051  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1052  *                pattern->unachored, the comparison is un-, left-,
1053  *                right-anchored, or both.
1054  *                The individual domain names are compared with
1055  *                simplematch().
1056  *
1057  * Parameters  :
1058  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
1059  *          2  :  fqdn = domain name against which the patterns are compared.
1060  *
1061  * Returns     :  0 => domains are equivalent, else no match.
1062  *
1063  *********************************************************************/
1064 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
1065 {
1066    char **pv, **fv;  /* vectors  */
1067    int    plen, flen;
1068    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1069
1070    plen = pattern->dcount;
1071    flen = fqdn->dcount;
1072
1073    if (flen < plen)
1074    {
1075       /* fqdn is too short to match this pattern */
1076       return 1;
1077    }
1078
1079    pv   = pattern->dvec;
1080    fv   = fqdn->dvec;
1081
1082    if (unanchored == ANCHOR_LEFT)
1083    {
1084       /*
1085        * Right anchored.
1086        *
1087        * Convert this into a fully anchored pattern with
1088        * the fqdn and pattern the same length
1089        */
1090       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1091       return simple_domaincmp(pv, fv, plen);
1092    }
1093    else if (unanchored == 0)
1094    {
1095       /* Fully anchored, check length */
1096       if (flen != plen)
1097       {
1098          return 1;
1099       }
1100       return simple_domaincmp(pv, fv, plen);
1101    }
1102    else if (unanchored == ANCHOR_RIGHT)
1103    {
1104       /* Left anchored, ignore all extra in fqdn */
1105       return simple_domaincmp(pv, fv, plen);
1106    }
1107    else
1108    {
1109       /* Unanchored */
1110       int n;
1111       int maxn = flen - plen;
1112       for (n = 0; n <= maxn; n++)
1113       {
1114          if (!simple_domaincmp(pv, fv, plen))
1115          {
1116             return 0;
1117          }
1118          /*
1119           * Doesn't match from start of fqdn
1120           * Try skipping first part of fqdn
1121           */
1122          fv++;
1123       }
1124       return 1;
1125    }
1126
1127 }
1128 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1129
1130
1131 /*********************************************************************
1132  *
1133  * Function    :  create_url_spec
1134  *
1135  * Description :  Creates a "url_spec" structure from a string.
1136  *                When finished, free with free_url_spec().
1137  *
1138  * Parameters  :
1139  *          1  :  url = Target url_spec to be filled in.  Will be
1140  *                      zeroed before use.
1141  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1142  *                      contents of this buffer are destroyed by this
1143  *                      function.  If this function succeeds, the
1144  *                      buffer is copied to url->spec.  If this
1145  *                      function fails, the contents of the buffer
1146  *                      are lost forever.
1147  *
1148  * Returns     :  JB_ERR_OK - Success
1149  *                JB_ERR_MEMORY - Out of memory
1150  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1151  *                               written to system log)
1152  *
1153  *********************************************************************/
1154 jb_err create_url_spec(struct url_spec *url, char *buf)
1155 {
1156    assert(url);
1157    assert(buf);
1158
1159    memset(url, '\0', sizeof(*url));
1160
1161    /* Remember the original specification for the CGI pages. */
1162    url->spec = strdup(buf);
1163    if (NULL == url->spec)
1164    {
1165       return JB_ERR_MEMORY;
1166    }
1167
1168    /* Is it a tag pattern? */
1169    if (0 == strncmpic(url->spec, "TAG:", 4))
1170    {
1171       /* The pattern starts with the first character after "TAG:" */
1172       const char *tag_pattern = buf + 4;
1173       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
1174    }
1175
1176    /* If it isn't a tag pattern it must be an URL pattern. */
1177    return compile_url_pattern(url, buf);
1178 }
1179
1180
1181 /*********************************************************************
1182  *
1183  * Function    :  free_url_spec
1184  *
1185  * Description :  Called from the "unloaders".  Freez the url
1186  *                structure elements.
1187  *
1188  * Parameters  :
1189  *          1  :  url = pointer to a url_spec structure.
1190  *
1191  * Returns     :  N/A
1192  *
1193  *********************************************************************/
1194 void free_url_spec(struct url_spec *url)
1195 {
1196    if (url == NULL) return;
1197
1198    freez(url->spec);
1199 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1200    if (url->host_regex)
1201    {
1202       regfree(url->host_regex);
1203       freez(url->host_regex);
1204    }
1205 #else
1206    freez(url->dbuffer);
1207    freez(url->dvec);
1208    url->dcount = 0;
1209 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1210    freez(url->port_list);
1211    if (url->preg)
1212    {
1213       regfree(url->preg);
1214       freez(url->preg);
1215    }
1216    if (url->tag_regex)
1217    {
1218       regfree(url->tag_regex);
1219       freez(url->tag_regex);
1220    }
1221 }
1222
1223
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Compares a port against a port list.  A missing
 *                (NULL) list places no restriction on the port.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL.
 *
 * Returns     :  TRUE for yes, FALSE otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (port_list == NULL)
   {
      /* No list means every port is acceptable. */
      return 1;
   }

   return (match_portlist(port_list, port) != 0);
}
1241
1242
1243 /*********************************************************************
1244  *
1245  * Function    :  host_matches
1246  *
1247  * Description :  Compares a host against a host pattern.
1248  *
1249  * Parameters  :
1250  *          1  :  url = The URL to match
1251  *          2  :  pattern = The URL pattern
1252  *
1253  * Returns     :  TRUE for yes, FALSE otherwise.
1254  *
1255  *********************************************************************/
1256 static int host_matches(const struct http_request *http,
1257                         const struct url_spec *pattern)
1258 {
1259 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1260    return ((NULL == pattern->host_regex)
1261       || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)));
1262 #else
1263    return ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)));
1264 #endif
1265 }
1266
1267
1268 /*********************************************************************
1269  *
1270  * Function    :  path_matches
1271  *
1272  * Description :  Compares a path against a path pattern.
1273  *
1274  * Parameters  :
1275  *          1  :  path = The path to match
1276  *          2  :  pattern = The URL pattern
1277  *
1278  * Returns     :  TRUE for yes, FALSE otherwise.
1279  *
1280  *********************************************************************/
1281 static int path_matches(const char *path, const struct url_spec *pattern)
1282 {
1283    return ((NULL == pattern->preg)
1284       || (0 == regexec(pattern->preg, path, 0, NULL, 0)));
1285 }
1286
1287
1288 /*********************************************************************
1289  *
1290  * Function    :  url_match
1291  *
1292  * Description :  Compare a URL against a URL pattern.
1293  *
1294  * Parameters  :
1295  *          1  :  pattern = a URL pattern
1296  *          2  :  url = URL to match
1297  *
1298  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1299  *
1300  *********************************************************************/
1301 int url_match(const struct url_spec *pattern,
1302               const struct http_request *http)
1303 {
1304    if (pattern->tag_regex != NULL)
1305    {
1306       /* It's a tag pattern and shouldn't be matched against URLs */
1307       return 0;
1308    }
1309
1310    return (port_matches(http->port, pattern->port_list)
1311       && host_matches(http, pattern) && path_matches(http->path, pattern));
1312
1313 }
1314
1315
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place: it is neither copied
 *                nor modified, so the check cannot fail for lack of
 *                memory.
 *
 *                Items are converted the way atoi() converts them,
 *                which means an empty or non-numeric item counts as
 *                0.  An item containing a '-' is a range; a missing
 *                lower bound is 0 and a missing (or zero) upper
 *                bound is 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list, must not be NULL
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   enum { HIGHEST_PORT = 65535 };
   const char *item = portlist;

   while (NULL != item)
   {
      /* The item ends at the next comma or at the end of the list. */
      const char *comma = strchr(item, ',');
      /* First '-' or ',': a '-' here means this item is a range. */
      const char *dash = strpbrk(item, "-,");

      if ((NULL != dash) && ('-' == *dash))
      {
         char *low_end;
         long low = strtol(item, &low_end, 10);
         long high = strtol(dash + 1, NULL, 10);

         /*
          * If nothing but white space precedes the dash, strtol()
          * takes the dash for a sign and reads the upper bound as
          * a negative lower bound.  The lower bound was omitted
          * in that case, which means 0.
          */
         if (low_end > dash)
         {
            low = 0;
         }

         /* An omitted upper bound extends the range to the top. */
         if (0 == high)
         {
            high = HIGHEST_PORT;
         }

         if ((port >= low) && (port <= high))
         {
            return 1;
         }
      }
      else if (port == strtol(item, NULL, 10))
      {
         /*
          * Single port.  strtol() never reads past the comma that
          * ends the item, so no terminator has to be planted.
          */
         return 1;
      }

      /* Jump to the next item, if there is one. */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1394
1395
1396 /*********************************************************************
1397  *
1398  * Function    :  parse_forwarder_address
1399  *
1400  * Description :  Parse out the host and port from a forwarder address.
1401  *
1402  * Parameters  :
1403  *          1  :  address = The forwarder address to parse.
1404  *          2  :  hostname = Used to return the hostname. NULL on error.
1405  *          3  :  port = Used to return the port. Untouched if no port
1406  *                       is specified.
1407  *
1408  * Returns     :  JB_ERR_OK on success
1409  *                JB_ERR_MEMORY on out of memory
1410  *                JB_ERR_PARSE on malformed address.
1411  *
1412  *********************************************************************/
1413 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1414 {
1415    char *p = address;
1416
1417    if ((*address == '[') && (NULL == strchr(address, ']')))
1418    {
1419       /* XXX: Should do some more validity checks here. */
1420       return JB_ERR_PARSE;
1421    }
1422
1423    *hostname = strdup(address);
1424    if (NULL == *hostname)
1425    {
1426       return JB_ERR_MEMORY;
1427    }
1428
1429    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1430    {
1431       *p++ = '\0';
1432       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1433       if (*p == ':')
1434       {
1435          *port = (int)strtol(++p, NULL, 0);
1436       }
1437    }
1438    else if (NULL != (p = strchr(*hostname, ':')))
1439    {
1440       *p++ = '\0';
1441       *port = (int)strtol(p, NULL, 0);
1442    }
1443
1444    return JB_ERR_OK;
1445
1446 }
1447
1448
1449 /*
1450   Local Variables:
1451   tab-width: 3
1452   end:
1453 */