urlmatch.c

    /* Revision identifier string of this source file ("$Id$" keyword format). */
    const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.88 2016/03/17 10:40:53 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2014
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
   /* Version string of urlmatch.h, taken from the URLMATCH_H_VERSION macro. */
   const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;

   /*
    * Selects how compile_pattern() decorates a pattern before it is
    * handed to regcomp().
    */
   enum regex_anchoring
   {
      NO_ANCHORING,        /* Pattern is compiled as given */
      LEFT_ANCHORED,       /* A '^' is put in front of the pattern */
      RIGHT_ANCHORED,      /* A '$' is appended to the pattern */
      RIGHT_ANCHORED_HOST  /* An optional literal dot followed by '$' is appended */
   };
   /* Forward declaration, compile_url_pattern() calls it before its definition. */
   static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern);
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_PARSE on malformed command/URL
 119  *                             or >100 domains deep.
 120  *
 121  *********************************************************************/
 122 jb_err init_domain_components(struct http_request *http)
 123 {
 124    char *vec[BUFFER_SIZE];
 125    size_t size;
 126    char *p;
 127
 128    http->dbuffer = strdup_or_die(http->host);
 129
 130    /* map to lower case */
 131    for (p = http->dbuffer; *p ; p++)
 132    {
 133       *p = (char)privoxy_tolower(*p);
 134    }
 135
 136    /* split the domain name into components */
 137    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 138
 139    if (http->dcount <= 0)
 140    {
 141       /*
 142        * Error: More than SZ(vec) components in domain
 143        *    or: no components in domain
 144        */
 145       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 146       return JB_ERR_PARSE;
 147    }
 148
 149    /* save a copy of the pointers in dvec */
 150    size = (size_t)http->dcount * sizeof(*http->dvec);
 151
 152    http->dvec = malloc_or_die(size);
 153
 154    memcpy(http->dvec, vec, size);
 155
 156    return JB_ERR_OK;
 157 }
 158 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 159
 160
 161 /*********************************************************************
 162  *
 163  * Function    :  url_requires_percent_encoding
 164  *
 165  * Description :  Checks if an URL contains invalid characters
 166  *                according to RFC 3986 that should be percent-encoded.
 167  *                Does not verify whether or not the passed string
 168  *                actually is a valid URL.
 169  *
 170  * Parameters  :
 171  *          1  :  url = URL to check
 172  *
 173  * Returns     :  True in case of valid URLs, false otherwise
 174  *
 175  *********************************************************************/
 176 int url_requires_percent_encoding(const char *url)
 177 {
 178    static const char allowed_characters[128] = {
 179       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 180       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 181       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 182       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 183       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 184       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 185       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 186       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 187       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 188       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 189       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 190       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 191       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 192    };
 193
 194    while (*url != '\0')
 195    {
 196       const unsigned int i = (unsigned char)*url++;
 197       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 198       {
 199          return TRUE;
 200       }
 201    }
 202
 203    return FALSE;
 204
 205 }
 206
 207
 208 /*********************************************************************
 209  *
 210  * Function    :  parse_http_url
 211  *
 212  * Description :  Parse out the host and port from the URL.  Find the
 213  *                hostname & path, port (if ':'), and/or password (if '@')
 214  *
 215  * Parameters  :
 216  *          1  :  url = URL (or is it URI?) to break down
 217  *          2  :  http = pointer to the http structure to hold elements.
 218  *                       Must be initialized with valid values (like NULLs).
 219  *          3  :  require_protocol = Whether or not URLs without
 220  *                                   protocol are acceptable.
 221  *
 222  * Returns     :  JB_ERR_OK on success
 223  *                JB_ERR_PARSE on malformed command/URL
 224  *                             or >100 domains deep.
 225  *
 226  *********************************************************************/
 227 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 228 {
 229    int host_available = 1; /* A proxy can dream. */
 230
 231    /*
 232     * Save our initial URL
 233     */
 234    http->url = strdup_or_die(url);
 235
 236    /*
 237     * Check for * URI. If found, we're done.
 238     */
 239    if (*http->url == '*')
 240    {
 241       http->path = strdup_or_die("*");
 242       http->hostport = strdup_or_die("");
 243       if (http->url[1] != '\0')
 244       {
 245          return JB_ERR_PARSE;
 246       }
 247       return JB_ERR_OK;
 248    }
 249
 250
 251    /*
 252     * Split URL into protocol,hostport,path.
 253     */
 254    {
 255       char *buf;
 256       char *url_noproto;
 257       char *url_path;
 258
 259       buf = strdup_or_die(url);
 260
 261       /* Find the start of the URL in our scratch space */
 262       url_noproto = buf;
 263       if (strncmpic(url_noproto, "http://",  7) == 0)
 264       {
 265          url_noproto += 7;
 266       }
 267       else if (strncmpic(url_noproto, "https://", 8) == 0)
 268       {
 269          /*
 270           * Should only happen when called from cgi_show_url_info().
 271           */
 272          url_noproto += 8;
 273          http->ssl = 1;
 274       }
 275       else if (*url_noproto == '/')
 276       {
 277         /*
 278          * Short request line without protocol and host.
 279          * Most likely because the client's request
 280          * was intercepted and redirected into Privoxy.
 281          */
 282          http->host = NULL;
 283          host_available = 0;
 284       }
 285       else if (require_protocol)
 286       {
 287          freez(buf);
 288          return JB_ERR_PARSE;
 289       }
 290
 291       url_path = strchr(url_noproto, '/');
 292       if (url_path != NULL)
 293       {
 294          /*
 295           * Got a path.
 296           *
 297           * NOTE: The following line ignores the path for HTTPS URLS.
 298           * This means that you get consistent behaviour if you type a
 299           * https URL in and it's parsed by the function.  (When the
 300           * URL is actually retrieved, SSL hides the path part).
 301           */
 302          http->path = strdup_or_die(http->ssl ? "/" : url_path);
 303          *url_path = '\0';
 304          http->hostport = strdup_or_die(url_noproto);
 305       }
 306       else
 307       {
 308          /*
 309           * Repair broken HTTP requests that don't contain a path,
 310           * or CONNECT requests
 311           */
 312          http->path = strdup_or_die("/");
 313          http->hostport = strdup_or_die(url_noproto);
 314       }
 315
 316       freez(buf);
 317    }
 318
 319    if (!host_available)
 320    {
 321       /* Without host, there is nothing left to do here */
 322       return JB_ERR_OK;
 323    }
 324
 325    /*
 326     * Split hostport into user/password (ignored), host, port.
 327     */
 328    {
 329       char *buf;
 330       char *host;
 331       char *port;
 332
 333       buf = strdup_or_die(http->hostport);
 334
 335       /* check if url contains username and/or password */
 336       host = strchr(buf, '@');
 337       if (host != NULL)
 338       {
 339          /* Contains username/password, skip it and the @ sign. */
 340          host++;
 341       }
 342       else
 343       {
 344          /* No username or password. */
 345          host = buf;
 346       }
 347
 348       /* Move after hostname before port number */
 349       if (*host == '[')
 350       {
 351          /* Numeric IPv6 address delimited by brackets */
 352          host++;
 353          port = strchr(host, ']');
 354
 355          if (port == NULL)
 356          {
 357             /* Missing closing bracket */
 358             freez(buf);
 359             return JB_ERR_PARSE;
 360          }
 361
 362          *port++ = '\0';
 363
 364          if (*port == '\0')
 365          {
 366             port = NULL;
 367          }
 368          else if (*port != ':')
 369          {
 370             /* Garbage after closing bracket */
 371             freez(buf);
 372             return JB_ERR_PARSE;
 373          }
 374       }
 375       else
 376       {
 377          /* Plain non-escaped hostname */
 378          port = strchr(host, ':');
 379       }
 380
 381       /* check if url contains port */
 382       if (port != NULL)
 383       {
 384          /* Contains port */
 385          char *endptr;
 386          long parsed_port;
 387          /* Terminate hostname and point to start of port string */
 388          *port++ = '\0';
 389          parsed_port = strtol(port, &endptr, 10);
 390          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 391          {
 392             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 393             freez(buf);
 394             return JB_ERR_PARSE;
 395          }
 396          http->port = (int)parsed_port;
 397       }
 398       else
 399       {
 400          /* No port specified. */
 401          http->port = (http->ssl ? 443 : 80);
 402       }
 403
 404       http->host = strdup_or_die(host);
 405
 406       freez(buf);
 407    }
 408
 409 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 410    return JB_ERR_OK;
 411 #else
 412    /* Split domain name so we can compare it against wildcards */
 413    return init_domain_components(http);
 414 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 415
 416 }
 417
 418
 419 /*********************************************************************
 420  *
 421  * Function    :  unknown_method
 422  *
 423  * Description :  Checks whether a method is unknown.
 424  *
 425  * Parameters  :
 426  *          1  :  method = points to a http method
 427  *
 428  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 429  *
 430  *********************************************************************/
 431 static int unknown_method(const char *method)
 432 {
 433    static const char * const known_http_methods[] = {
 434       /* Basic HTTP request type */
 435       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 436       /* webDAV extensions (RFC2518) */
 437       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 438       /*
 439        * Microsoft webDAV extension for Exchange 2000.  See:
 440        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 441        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 442        */
 443       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 444       /*
 445        * Another Microsoft webDAV extension for Exchange 2000.  See:
 446        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 447        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 448        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 449        */
 450       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 451       /*
 452        * Yet another WebDAV extension, this time for
 453        * Web Distributed Authoring and Versioning (RFC3253)
 454        */
 455       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 456       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 457       /*
 458        * The PATCH method is defined by RFC5789, the format of the
 459        * actual patch in the body depends on the application, but from
 460        * Privoxy's point of view it doesn't matter.
 461        */
 462       "PATCH",
 463    };
 464    int i;
 465
 466    for (i = 0; i < SZ(known_http_methods); i++)
 467    {
 468       if (0 == strcmpic(method, known_http_methods[i]))
 469       {
 470          return FALSE;
 471       }
 472    }
 473
 474    return TRUE;
 475
 476 }
 477
 478
 479 /*********************************************************************
 480  *
 481  * Function    :  normalize_http_version
 482  *
 483  * Description :  Take a supported HTTP version string and remove
 484  *                leading zeroes etc., reject unsupported versions.
 485  *
 486  *                This is an explicit RFC 2616 (3.1) MUST and
 487  *                RFC 7230 mandates that intermediaries send their
 488  *                own HTTP-version in forwarded messages.
 489  *
 490  * Parameters  :
 491  *          1  :  http_version = HTTP version string
 492  *
 493  * Returns     :  JB_ERR_OK on success
 494  *                JB_ERR_PARSE if the HTTP version is unsupported
 495  *
 496  *********************************************************************/
 497 jb_err static normalize_http_version(char *http_version)
 498 {
 499    unsigned int major_version;
 500    unsigned int minor_version;
 501
 502    if (2 != sscanf(http_version, "HTTP/%u.%u", &major_version, &minor_version))
 503    {
 504       log_error(LOG_LEVEL_ERROR, "Unsupported HTTP version: %s", http_version);
 505       return JB_ERR_PARSE;
 506    }
 507
 508    if (major_version != 1 || (minor_version != 0 && minor_version != 1))
 509    {
 510       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 511          "versions are 1.0 and 1.1. This rules out: %s", http_version);
 512       return JB_ERR_PARSE;
 513    }
 514
 515    assert(strlen(http_version) >= 8);
 516    snprintf(http_version, 9, "HTTP/%u.%u", major_version, minor_version);
 517
 518    return JB_ERR_OK;
 519
 520 }
 521
 522
 523 /*********************************************************************
 524  *
 525  * Function    :  parse_http_request
 526  *
 527  * Description :  Parse out the host and port from the URL.  Find the
 528  *                hostname & path, port (if ':'), and/or password (if '@')
 529  *
 530  * Parameters  :
 531  *          1  :  req = HTTP request line to break down
 532  *          2  :  http = pointer to the http structure to hold elements
 533  *
 534  * Returns     :  JB_ERR_OK on success
 535  *                JB_ERR_CGI_PARAMS on malformed command/URL
 536  *                                  or >100 domains deep.
 537  *
 538  *********************************************************************/
 539 jb_err parse_http_request(const char *req, struct http_request *http)
 540 {
 541    char *buf;
 542    char *v[3];
 543    int n;
 544    jb_err err;
 545
 546    memset(http, '\0', sizeof(*http));
 547
 548    buf = strdup_or_die(req);
 549
 550    n = ssplit(buf, " \r\n", v, SZ(v));
 551    if (n != 3)
 552    {
 553       freez(buf);
 554       return JB_ERR_PARSE;
 555    }
 556
 557    /*
 558     * Fail in case of unknown methods
 559     * which we might not handle correctly.
 560     *
 561     * XXX: There should be a config option
 562     * to forward requests with unknown methods
 563     * anyway. Most of them don't need special
 564     * steps.
 565     */
 566    if (unknown_method(v[0]))
 567    {
 568       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 569       freez(buf);
 570       return JB_ERR_PARSE;
 571    }
 572
 573    if (JB_ERR_OK != normalize_http_version(v[2]))
 574    {
 575       freez(buf);
 576       return JB_ERR_PARSE;
 577    }
 578
 579    http->ssl = !strcmpic(v[0], "CONNECT");
 580
 581    err = parse_http_url(v[1], http, !http->ssl);
 582    if (err)
 583    {
 584       freez(buf);
 585       return err;
 586    }
 587
 588    /*
 589     * Copy the details into the structure
 590     */
 591    http->cmd = strdup_or_die(req);
 592    http->gpc = strdup_or_die(v[0]);
 593    http->ver = strdup_or_die(v[2]);
 594    http->ocmd = strdup_or_die(http->cmd);
 595
 596    freez(buf);
 597
 598    return JB_ERR_OK;
 599
 600 }
 601
 602
 603 /*********************************************************************
 604  *
 605  * Function    :  compile_pattern
 606  *
 607  * Description :  Compiles a host, domain or TAG pattern.
 608  *
 609  * Parameters  :
 610  *          1  :  pattern = The pattern to compile.
 611  *          2  :  anchoring = How the regex should be modified
 612  *                            before compilation. Can be either
 613  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 614  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 615  *          3  :  url     = In case of failures, the spec member is
 616  *                          logged and the structure freed.
 617  *          4  :  regex   = Where the compiled regex should be stored.
 618  *
 619  * Returns     :  JB_ERR_OK - Success
 620  *                JB_ERR_PARSE - Cannot parse regex
 621  *
 622  *********************************************************************/
 623 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 624                               struct pattern_spec *url, regex_t **regex)
 625 {
 626    int errcode;
 627    const char *fmt = NULL;
 628    char *rebuf;
 629    size_t rebuf_size;
 630
 631    assert(pattern);
 632
 633    if (pattern[0] == '\0')
 634    {
 635       *regex = NULL;
 636       return JB_ERR_OK;
 637    }
 638
 639    switch (anchoring)
 640    {
 641       case NO_ANCHORING:
 642          fmt = "%s";
 643          break;
 644       case RIGHT_ANCHORED:
 645          fmt = "%s$";
 646          break;
 647       case RIGHT_ANCHORED_HOST:
 648          fmt = "%s\\.?$";
 649          break;
 650       case LEFT_ANCHORED:
 651          fmt = "^%s";
 652          break;
 653       default:
 654          log_error(LOG_LEVEL_FATAL,
 655             "Invalid anchoring in compile_pattern %d", anchoring);
 656    }
 657    rebuf_size = strlen(pattern) + strlen(fmt);
 658    rebuf = malloc_or_die(rebuf_size);
 659    *regex = zalloc_or_die(sizeof(**regex));
 660
 661    snprintf(rebuf, rebuf_size, fmt, pattern);
 662
 663    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 664
 665    if (errcode)
 666    {
 667       size_t errlen = regerror(errcode, *regex, rebuf, rebuf_size);
 668       if (errlen > (rebuf_size - (size_t)1))
 669       {
 670          errlen = rebuf_size - (size_t)1;
 671       }
 672       rebuf[errlen] = '\0';
 673       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 674          pattern, url->spec, rebuf);
 675       free_pattern_spec(url);
 676       freez(rebuf);
 677
 678       return JB_ERR_PARSE;
 679    }
 680    freez(rebuf);
 681
 682    return JB_ERR_OK;
 683
 684 }
 685
 686
 687 /*********************************************************************
 688  *
 689  * Function    :  compile_url_pattern
 690  *
 691  * Description :  Compiles the three parts of an URL pattern.
 692  *
 693  * Parameters  :
 694  *          1  :  url = Target pattern_spec to be filled in.
 695  *          2  :  buf = The url pattern to compile. Will be messed up.
 696  *
 697  * Returns     :  JB_ERR_OK - Success
 698  *                JB_ERR_MEMORY - Out of memory
 699  *                JB_ERR_PARSE - Cannot parse regex
 700  *
 701  *********************************************************************/
 702 static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
 703 {
 704    char *p;
 705
 706    p = strchr(buf, '/');
 707    if (NULL != p)
 708    {
 709       /*
 710        * Only compile the regex if it consists of more than
 711        * a single slash, otherwise it wouldn't affect the result.
 712        */
 713       if (p[1] != '\0')
 714       {
 715          /*
 716           * XXX: does it make sense to compile the slash at the beginning?
 717           */
 718          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
 719
 720          if (JB_ERR_OK != err)
 721          {
 722             return err;
 723          }
 724       }
 725       *p = '\0';
 726    }
 727
 728    /*
 729     * IPv6 numeric hostnames can contain colons, thus we need
 730     * to delimit the hostname before the real port separator.
 731     * As brackets are already used in the hostname pattern,
 732     * we use angle brackets ('<', '>') instead.
 733     */
 734    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 735    {
 736       *p++ = '\0';
 737       buf++;
 738
 739       if (*p == '\0')
 740       {
 741          /* IPv6 address without port number */
 742          p = NULL;
 743       }
 744       else if (*p != ':')
 745       {
 746          /* Garbage after address delimiter */
 747          return JB_ERR_PARSE;
 748       }
 749    }
 750    else
 751    {
 752       p = strchr(buf, ':');
 753    }
 754
 755    if (NULL != p)
 756    {
 757       *p++ = '\0';
 758       url->pattern.url_spec.port_list = strdup_or_die(p);
 759    }
 760    else
 761    {
 762       url->pattern.url_spec.port_list = NULL;
 763    }
 764
 765    if (buf[0] != '\0')
 766    {
 767       return compile_host_pattern(url, buf);
 768    }
 769
 770    return JB_ERR_OK;
 771
 772 }
 773
 774
 775 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 776 /*********************************************************************
 777  *
 778  * Function    :  compile_host_pattern
 779  *
 780  * Description :  Parses and compiles a host pattern.
 781  *
 782  * Parameters  :
 783  *          1  :  url = Target pattern_spec to be filled in.
 784  *          2  :  host_pattern = Host pattern to compile.
 785  *
 786  * Returns     :  JB_ERR_OK - Success
 787  *                JB_ERR_MEMORY - Out of memory
 788  *                JB_ERR_PARSE - Cannot parse regex
 789  *
 790  *********************************************************************/
 791 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 792 {
 793    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->pattern.url_spec.host_regex);
 794 }
 795
 796 #else
 797
 798 /*********************************************************************
 799  *
 800  * Function    :  compile_host_pattern
 801  *
 802  * Description :  Parses and "compiles" an old-school host pattern.
 803  *
 804  * Parameters  :
 805  *          1  :  url = Target pattern_spec to be filled in.
 806  *          2  :  host_pattern = Host pattern to parse.
 807  *
 808  * Returns     :  JB_ERR_OK - Success
 809  *                JB_ERR_PARSE - Cannot parse regex
 810  *
 811  *********************************************************************/
 812 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 813 {
 814    char *v[150];
 815    size_t size;
 816    char *p;
 817
 818    /*
 819     * Parse domain part
 820     */
 821    if (host_pattern[strlen(host_pattern) - 1] == '.')
 822    {
 823       url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
 824    }
 825    if (host_pattern[0] == '.')
 826    {
 827       url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
 828    }
 829
 830    /*
 831     * Split domain into components
 832     */
 833    url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
 834
 835    /*
 836     * Map to lower case
 837     */
 838    for (p = url->pattern.url_spec.dbuffer; *p ; p++)
 839    {
 840       *p = (char)privoxy_tolower(*p);
 841    }
 842
 843    /*
 844     * Split the domain name into components
 845     */
 846    url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
 847
 848    if (url->pattern.url_spec.dcount < 0)
 849    {
 850       free_pattern_spec(url);
 851       return JB_ERR_PARSE;
 852    }
 853    else if (url->pattern.url_spec.dcount != 0)
 854    {
 855       /*
 856        * Save a copy of the pointers in dvec
 857        */
 858       size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
 859
 860       url->pattern.url_spec.dvec = malloc_or_die(size);
 861
 862       memcpy(url->pattern.url_spec.dvec, v, size);
 863    }
 864    /*
 865     * else dcount == 0 in which case we needn't do anything,
 866     * since dvec will never be accessed and the pattern will
 867     * match all domains.
 868     */
 869    return JB_ERR_OK;
 870 }
 871
 872
 873 /*********************************************************************
 874  *
 875  * Function    :  simplematch
 876  *
 877  * Description :  String matching, with a (greedy) '*' wildcard that
 878  *                stands for zero or more arbitrary characters and
 879  *                character classes in [], which take both enumerations
 880  *                and ranges.
 881  *
 882  * Parameters  :
 883  *          1  :  pattern = pattern for matching
 884  *          2  :  text    = text to be matched
 885  *
 886  * Returns     :  0 if match, else nonzero
 887  *
 888  *********************************************************************/
 889 static int simplematch(const char *pattern, const char *text)
 890 {
 891    const unsigned char *pat = (const unsigned char *)pattern;
 892    const unsigned char *txt = (const unsigned char *)text;
 893    const unsigned char *fallback = pat;
 894    int wildcard = 0;
 895
 896    unsigned char lastchar = 'a';
 897    unsigned i;
 898    unsigned char charmap[32];
 899
 900    while (*txt)
 901    {
 902
 903       /* EOF pattern but !EOF text? */
 904       if (*pat == '\0')
 905       {
 906          if (wildcard)
 907          {
 908             pat = fallback;
 909          }
 910          else
 911          {
 912             return 1;
 913          }
 914       }
 915
 916       /* '*' in the pattern?  */
 917       if (*pat == '*')
 918       {
 919
 920          /* The pattern ends afterwards? Speed up the return. */
 921          if (*++pat == '\0')
 922          {
 923             return 0;
 924          }
 925
 926          /* Else, set wildcard mode and remember position after '*' */
 927          wildcard = 1;
 928          fallback = pat;
 929       }
 930
 931       /* Character range specification? */
 932       if (*pat == '[')
 933       {
 934          memset(charmap, '\0', sizeof(charmap));
 935
 936          while (*++pat != ']')
 937          {
 938             if (!*pat)
 939             {
 940                return 1;
 941             }
 942             else if (*pat == '-')
 943             {
 944                if ((*++pat == ']') || *pat == '\0')
 945                {
 946                   return(1);
 947                }
 948                for (i = lastchar; i <= *pat; i++)
 949                {
 950                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 951                }
 952             }
 953             else
 954             {
 955                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 956                lastchar = *pat;
 957             }
 958          }
 959       } /* -END- if Character range specification */
 960
 961
 962       /*
 963        * Char match, or char range match?
 964        */
 965       if ((*pat == *txt)
 966        || (*pat == '?')
 967        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 968       {
 969          /*
 970           * Success: Go ahead
 971           */
 972          pat++;
 973       }
 974       else if (!wildcard)
 975       {
 976          /*
 977           * No match && no wildcard: No luck
 978           */
 979          return 1;
 980       }
 981       else if (pat != fallback)
 982       {
 983          /*
 984           * Increment text pointer if in char range matching
 985           */
 986          if (*pat == ']')
 987          {
 988             txt++;
 989          }
 990          /*
 991           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 992           */
 993          pat = fallback;
 994          /*
 995           * Restart matching from current text pointer
 996           */
 997          continue;
 998       }
 999       txt++;
1000    }
1001
1002    /* Cut off extra '*'s */
1003    if (*pat == '*') pat++;
1004
1005    /* If this is the pattern's end, fine! */
1006    return(*pat);
1007
1008 }
1009
1010
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Compares two vectors of domain components pairwise
 *                with simplematch().  Every one of the len positions
 *                has to match, so the comparison is anchored on both
 *                sides.  This is only used by domain_match.
 *
 * Parameters  :
 *          1  :  pv = vector of pattern components
 *          2  :  fv = vector of domain components to check
 *          3  :  len = number of components to compare; both vectors
 *                      must provide at least this many entries
 *
 * Returns     :  0 => all len component pairs match,
 *                1 => at least one pair does not match.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   int remaining;

   /* Walk both vectors in lockstep and bail out on the first mismatch. */
   for (remaining = len; remaining > 0; remaining--)
   {
      const char *pattern_part = *pv++;
      const char *domain_part = *fv++;

      if (0 != simplematch(pattern_part, domain_part))
      {
         return 1;
      }
   }

   /* Nothing mismatched (also the result for len <= 0). */
   return 0;

}
1045
1046
1047 /*********************************************************************
1048  *
1049  * Function    :  domain_match
1050  *
1051  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1052  *                p.pattern->unachored, the comparison is un-, left-,
1053  *                right-anchored, or both.
1054  *                The individual domain names are compared with
1055  *                simplematch().
1056  *
1057  * Parameters  :
1058  *          1  :  p = a domain that may contain a '*' as a wildcard.
1059  *          2  :  fqdn = domain name against which the patterns are compared.
1060  *
1061  * Returns     :  0 => domains are equivalent, else no match.
1062  *
1063  *********************************************************************/
1064 static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
1065 {
1066    char **pv, **fv;  /* vectors  */
1067    int    plen, flen;
1068    int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1069
1070    plen = p->pattern.url_spec.dcount;
1071    flen = fqdn->dcount;
1072
1073    if (flen < plen)
1074    {
1075       /* fqdn is too short to match this pattern */
1076       return 1;
1077    }
1078
1079    pv   = p->pattern.url_spec.dvec;
1080    fv   = fqdn->dvec;
1081
1082    if (unanchored == ANCHOR_LEFT)
1083    {
1084       /*
1085        * Right anchored.
1086        *
1087        * Convert this into a fully anchored pattern with
1088        * the fqdn and pattern the same length
1089        */
1090       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1091       return simple_domaincmp(pv, fv, plen);
1092    }
1093    else if (unanchored == 0)
1094    {
1095       /* Fully anchored, check length */
1096       if (flen != plen)
1097       {
1098          return 1;
1099       }
1100       return simple_domaincmp(pv, fv, plen);
1101    }
1102    else if (unanchored == ANCHOR_RIGHT)
1103    {
1104       /* Left anchored, ignore all extra in fqdn */
1105       return simple_domaincmp(pv, fv, plen);
1106    }
1107    else
1108    {
1109       /* Unanchored */
1110       int n;
1111       int maxn = flen - plen;
1112       for (n = 0; n <= maxn; n++)
1113       {
1114          if (!simple_domaincmp(pv, fv, plen))
1115          {
1116             return 0;
1117          }
1118          /*
1119           * Doesn't match from start of fqdn
1120           * Try skipping first part of fqdn
1121           */
1122          fv++;
1123       }
1124       return 1;
1125    }
1126
1127 }
1128 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1129
1130
1131 /*********************************************************************
1132  *
1133  * Function    :  create_pattern_spec
1134  *
1135  * Description :  Creates a "pattern_spec" structure from a string.
1136  *                When finished, free with free_pattern_spec().
1137  *
1138  * Parameters  :
1139  *          1  :  pattern = Target pattern_spec to be filled in.
1140  *                          Will be zeroed before use.
1141  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1142  *                      contents of this buffer are destroyed by this
1143  *                      function.  If this function succeeds, the
1144  *                      buffer is copied to pattern->spec.  If this
1145  *                      function fails, the contents of the buffer
1146  *                      are lost forever.
1147  *
1148  * Returns     :  JB_ERR_OK - Success
1149  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1150  *                               written to system log)
1151  *
1152  *********************************************************************/
1153 jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
1154 {
1155    static const struct
1156    {
1157       /** The tag pattern prefix to match */
1158       const char *prefix;
1159
1160       /** The length of the prefix to match */
1161       const size_t prefix_length;
1162
1163       /** The pattern flag */
1164       const unsigned flag;
1165    } tag_pattern[] = {
1166       { "TAG:",              4, PATTERN_SPEC_TAG_PATTERN},
1167  #ifdef FEATURE_CLIENT_TAGS
1168       { "CLIENT-TAG:",      11, PATTERN_SPEC_CLIENT_TAG_PATTERN},
1169  #endif
1170       { "NO-REQUEST-TAG:",  15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
1171       { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
1172    };
1173    int i;
1174
1175    assert(pattern);
1176    assert(buf);
1177
1178    memset(pattern, '\0', sizeof(*pattern));
1179
1180    /* Remember the original specification for the CGI pages. */
1181    pattern->spec = strdup_or_die(buf);
1182
1183    /* Check if it's a tag pattern */
1184    for (i = 0; i < SZ(tag_pattern); i++)
1185    {
1186       if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
1187       {
1188          /* The regex starts after the prefix */
1189          const char *tag_regex = buf + tag_pattern[i].prefix_length;
1190
1191          pattern->flags |= tag_pattern[i].flag;
1192
1193          return compile_pattern(tag_regex, NO_ANCHORING, pattern,
1194             &pattern->pattern.tag_regex);
1195       }
1196    }
1197
1198    /* If it isn't a tag pattern it must be an URL pattern. */
1199    pattern->flags |= PATTERN_SPEC_URL_PATTERN;
1200
1201    return compile_url_pattern(pattern, buf);
1202
1203 }
1204
1205
1206 /*********************************************************************
1207  *
1208  * Function    :  free_pattern_spec
1209  *
1210  * Description :  Called from the "unloaders".  Freez the pattern
1211  *                structure elements.
1212  *
1213  * Parameters  :
1214  *          1  :  pattern = pointer to a pattern_spec structure.
1215  *
1216  * Returns     :  N/A
1217  *
1218  *********************************************************************/
1219 void free_pattern_spec(struct pattern_spec *pattern)
1220 {
1221    if (pattern == NULL) return;
1222
1223    freez(pattern->spec);
1224 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1225    if (pattern->pattern.url_spec.host_regex)
1226    {
1227       regfree(pattern->pattern.url_spec.host_regex);
1228       freez(pattern->pattern.url_spec.host_regex);
1229    }
1230 #else
1231    freez(pattern->pattern.url_spec.dbuffer);
1232    freez(pattern->pattern.url_spec.dvec);
1233    pattern->pattern.url_spec.dcount = 0;
1234 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1235    freez(pattern->pattern.url_spec.port_list);
1236    if (pattern->pattern.url_spec.preg)
1237    {
1238       regfree(pattern->pattern.url_spec.preg);
1239       freez(pattern->pattern.url_spec.preg);
1240    }
1241    if (pattern->pattern.tag_regex)
1242    {
1243       regfree(pattern->pattern.tag_regex);
1244       freez(pattern->pattern.tag_regex);
1245    }
1246 }
1247
1248
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Compares a port against a port list.  A missing
 *                (NULL) port list matches every port, otherwise
 *                match_portlist() decides.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL if there is no restriction.
 *
 * Returns     :  TRUE (1) for yes, FALSE (0) otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   /* No list means no port restriction. */
   if (NULL == port_list)
   {
      return 1;
   }

   return (0 != match_portlist(port_list, port));
}
1266
1267
1268 /*********************************************************************
1269  *
1270  * Function    :  host_matches
1271  *
1272  * Description :  Compares a host against a host pattern.
1273  *
1274  * Parameters  :
1275  *          1  :  url = The URL to match
1276  *          2  :  pattern = The URL pattern
1277  *
1278  * Returns     :  TRUE for yes, FALSE otherwise.
1279  *
1280  *********************************************************************/
1281 static int host_matches(const struct http_request *http,
1282                         const struct pattern_spec *pattern)
1283 {
1284    assert(http->host != NULL);
1285 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1286    return ((NULL == pattern->pattern.url_spec.host_regex)
1287       || (0 == regexec(pattern->pattern.url_spec.host_regex, http->host, 0, NULL, 0)));
1288 #else
1289    return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
1290 #endif
1291 }
1292
1293
1294 /*********************************************************************
1295  *
1296  * Function    :  path_matches
1297  *
1298  * Description :  Compares a path against a path pattern.
1299  *
1300  * Parameters  :
1301  *          1  :  path = The path to match
1302  *          2  :  pattern = The URL pattern
1303  *
1304  * Returns     :  TRUE for yes, FALSE otherwise.
1305  *
1306  *********************************************************************/
1307 static int path_matches(const char *path, const struct pattern_spec *pattern)
1308 {
1309    return ((NULL == pattern->pattern.url_spec.preg)
1310       || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
1311 }
1312
1313
1314 /*********************************************************************
1315  *
1316  * Function    :  url_match
1317  *
1318  * Description :  Compare a URL against a URL pattern.
1319  *
1320  * Parameters  :
1321  *          1  :  pattern = a URL pattern
1322  *          2  :  url = URL to match
1323  *
1324  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1325  *
1326  *********************************************************************/
1327 int url_match(const struct pattern_spec *pattern,
1328               const struct http_request *http)
1329 {
1330    if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
1331    {
1332       /* It's not an URL pattern and thus shouldn't be matched against URLs */
1333       return 0;
1334    }
1335
1336    return (port_matches(http->port, pattern->pattern.url_spec.port_list)
1337       && host_matches(http, pattern) && path_matches(http->path, pattern));
1338
1339 }
1340
1341
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is parsed in place, it is neither copied
 *                nor modified.  Within an item the first '-' splits
 *                a range into lower and upper bound.  A bound that
 *                is missing or does not start with a number counts
 *                as 0 and an upper bound of 0 means 65535, thus
 *                "1024-" covers 1024 to 65535 and "-1023" covers
 *                0 to 1023.
 *
 * Parameters  :
 *          1  :  portlist = String with list.  NULL is treated
 *                           like a list without any item.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      const char *comma = strchr(item, (int) ',');
      const char *dash  = strchr(item, (int) '-');

      /* A dash behind the next comma belongs to a later item. */
      if ((NULL != dash) && (NULL != comma) && (dash > comma))
      {
         dash = NULL;
      }

      if (NULL == dash)
      {
         /*
          * No dash, check for equality.  strtol() stops at the
          * comma (or the end of the string) on its own.
          */
         if ((long)port == strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         char *parsed_up_to;
         long min = strtol(item, &parsed_up_to, 10);
         long max = strtol(dash + 1, NULL, 10);

         if (parsed_up_to > dash)
         {
            /*
             * There were no digits in front of the dash, so strtol()
             * took the dash as sign of the upper bound.  The lower
             * bound is empty and counts as 0.
             */
            min = 0;
         }
         if (0 == max)
         {
            max = 65535;
         }

         if (((long)port >= min) && ((long)port <= max))
         {
            return 1;
         }
      }

      /*
       * Jump to next item, if there is one
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1420
1421
1422 /*********************************************************************
1423  *
1424  * Function    :  parse_forwarder_address
1425  *
1426  * Description :  Parse out the host and port from a forwarder address.
1427  *
1428  * Parameters  :
1429  *          1  :  address = The forwarder address to parse.
1430  *          2  :  hostname = Used to return the hostname. NULL on error.
1431  *          3  :  port = Used to return the port. Untouched if no port
1432  *                       is specified.
1433  *
1434  * Returns     :  JB_ERR_OK on success
1435  *                JB_ERR_MEMORY on out of memory
1436  *                JB_ERR_PARSE on malformed address.
1437  *
1438  *********************************************************************/
1439 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1440 {
1441    char *p = address;
1442
1443    if ((*address == '[') && (NULL == strchr(address, ']')))
1444    {
1445       /* XXX: Should do some more validity checks here. */
1446       return JB_ERR_PARSE;
1447    }
1448
1449    *hostname = strdup_or_die(address);
1450
1451    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1452    {
1453       *p++ = '\0';
1454       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1455       if (*p == ':')
1456       {
1457          *port = (int)strtol(++p, NULL, 0);
1458       }
1459    }
1460    else if (NULL != (p = strchr(*hostname, ':')))
1461    {
1462       *p++ = '\0';
1463       *port = (int)strtol(p, NULL, 0);
1464    }
1465
1466    return JB_ERR_OK;
1467
1468 }
1469
1470
1471 /*
1472   Local Variables:
1473   tab-width: 3
1474   end:
1475 */