urlmatch.c

   1 /*********************************************************************
   2  *
   3  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   4  *
   5  * Purpose     :  Declares functions to match URLs against URL
   6  *                patterns.
   7  *
   8  * Copyright   :  Written by and Copyright (C) 2001-2020
   9  *                the Privoxy team. https://www.privoxy.org/
  10  *
  11  *                Based on the Internet Junkbuster originally written
  12  *                by and Copyright (C) 1997 Anonymous Coders and
  13  *                Junkbusters Corporation.  http://www.junkbusters.com
  14  *
  15  *                This program is free software; you can redistribute it
  16  *                and/or modify it under the terms of the GNU General
  17  *                Public License as published by the Free Software
  18  *                Foundation; either version 2 of the License, or (at
  19  *                your option) any later version.
  20  *
  21  *                This program is distributed in the hope that it will
  22  *                be useful, but WITHOUT ANY WARRANTY; without even the
  23  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  24  *                PARTICULAR PURPOSE.  See the GNU General Public
  25  *                License for more details.
  26  *
  27  *                The GNU General Public License should be included with
  28  *                this file.  If not, you can view it at
  29  *                http://www.gnu.org/copyleft/gpl.html
  30  *                or write to the Free Software Foundation, Inc., 59
  31  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  32  *
  33  *********************************************************************/
  34
  35
  36 #include "config.h"
  37
  38 #ifndef _WIN32
  39 #include <stdio.h>
  40 #include <sys/types.h>
  41 #endif
  42
  43 #include <stdlib.h>
  44 #include <ctype.h>
  45 #include <assert.h>
  46 #include <string.h>
  47
  48 #if !defined(_WIN32)
  49 #include <unistd.h>
  50 #endif
  51
  52 #include "project.h"
  53 #include "urlmatch.h"
  54 #include "ssplit.h"
  55 #include "miscutil.h"
  56 #include "errlog.h"
  57
/*
 * Selects how compile_pattern() decorates a pattern
 * before handing it to regcomp().
 */
enum regex_anchoring
{
   NO_ANCHORING,        /* The pattern is compiled as is */
   LEFT_ANCHORED,       /* A '^' is put in front of the pattern */
   RIGHT_ANCHORED,      /* A '$' is appended to the pattern */
   RIGHT_ANCHORED_HOST  /* An optional literal dot and a '$' are appended */
};
/*
 * Forward declarations for the host pattern compilers that are
 * called by compile_url_pattern() but defined further down.
 */
static jb_err compile_vanilla_host_pattern(struct pattern_spec *url, const char *host_pattern);
#ifdef FEATURE_PCRE_HOST_PATTERNS
static jb_err compile_pcre_host_pattern(struct pattern_spec *url, const char *host_pattern);
#endif
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->version);
  94    freez(http->host_ip_addr_str);
  95    freez(http->dbuffer);
  96    freez(http->dvec);
  97    http->dcount = 0;
  98 }
  99
 100
 101 /*********************************************************************
 102  *
 103  * Function    :  init_domain_components
 104  *
 105  * Description :  Splits the domain name so we can compare it
 106  *                against wildcards. It used to be part of
 107  *                parse_http_url, but was separated because the
 108  *                same code is required in chat in case of
 109  *                intercepted requests.
 110  *
 111  * Parameters  :
 112  *          1  :  http = pointer to the http structure to hold elements.
 113  *
 114  * Returns     :  JB_ERR_OK on success
 115  *                JB_ERR_PARSE on malformed command/URL
 116  *                             or >100 domains deep.
 117  *
 118  *********************************************************************/
 119 jb_err init_domain_components(struct http_request *http)
 120 {
 121    char *vec[BUFFER_SIZE];
 122    size_t size;
 123    char *p;
 124
 125    http->dbuffer = strdup_or_die(http->host);
 126
 127    /* map to lower case */
 128    for (p = http->dbuffer; *p ; p++)
 129    {
 130       *p = (char)privoxy_tolower(*p);
 131    }
 132
 133    /* split the domain name into components */
 134    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 135
 136    if (http->dcount <= 0)
 137    {
 138       /*
 139        * Error: More than SZ(vec) components in domain
 140        *    or: no components in domain
 141        */
 142       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 143       return JB_ERR_PARSE;
 144    }
 145
 146    /* save a copy of the pointers in dvec */
 147    size = (size_t)http->dcount * sizeof(*http->dvec);
 148
 149    http->dvec = malloc_or_die(size);
 150
 151    memcpy(http->dvec, vec, size);
 152
 153    return JB_ERR_OK;
 154 }
 155
 156
 157 /*********************************************************************
 158  *
 159  * Function    :  url_requires_percent_encoding
 160  *
 161  * Description :  Checks if an URL contains invalid characters
 162  *                according to RFC 3986 that should be percent-encoded.
 163  *                Does not verify whether or not the passed string
 164  *                actually is a valid URL.
 165  *
 166  * Parameters  :
 167  *          1  :  url = URL to check
 168  *
 169  * Returns     :  True in case of valid URLs, false otherwise
 170  *
 171  *********************************************************************/
 172 int url_requires_percent_encoding(const char *url)
 173 {
 174    static const char allowed_characters[128] = {
 175       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 176       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 177       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 178       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 179       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 180       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 181       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 182       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 183       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 184       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 185       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 186       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 187       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 188    };
 189
 190    while (*url != '\0')
 191    {
 192       const unsigned int i = (unsigned char)*url++;
 193       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 194       {
 195          return TRUE;
 196       }
 197    }
 198
 199    return FALSE;
 200
 201 }
 202
 203
 204 /*********************************************************************
 205  *
 206  * Function    :  parse_http_url
 207  *
 208  * Description :  Parse out the host and port from the URL.  Find the
 209  *                hostname & path, port (if ':'), and/or password (if '@')
 210  *
 211  * Parameters  :
 212  *          1  :  url = URL (or is it URI?) to break down
 213  *          2  :  http = pointer to the http structure to hold elements.
 214  *                       Must be initialized with valid values (like NULLs).
 215  *          3  :  require_protocol = Whether or not URLs without
 216  *                                   protocol are acceptable.
 217  *
 218  * Returns     :  JB_ERR_OK on success
 219  *                JB_ERR_PARSE on malformed command/URL
 220  *                             or >100 domains deep.
 221  *
 222  *********************************************************************/
 223 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 224 {
 225    int host_available = 1; /* A proxy can dream. */
 226
 227    /*
 228     * Save our initial URL
 229     */
 230    http->url = strdup_or_die(url);
 231
 232    /*
 233     * Check for * URI. If found, we're done.
 234     */
 235    if (*http->url == '*')
 236    {
 237       http->path = strdup_or_die("*");
 238       http->hostport = strdup_or_die("");
 239       if (http->url[1] != '\0')
 240       {
 241          return JB_ERR_PARSE;
 242       }
 243       return JB_ERR_OK;
 244    }
 245
 246
 247    /*
 248     * Split URL into protocol,hostport,path.
 249     */
 250    {
 251       char *buf;
 252       char *url_noproto;
 253       char *url_path;
 254
 255       buf = strdup_or_die(url);
 256
 257       /* Find the start of the URL in our scratch space */
 258       url_noproto = buf;
 259       if (strncmpic(url_noproto, "http://",  7) == 0)
 260       {
 261          url_noproto += 7;
 262       }
 263       else if (strncmpic(url_noproto, "https://", 8) == 0)
 264       {
 265          /*
 266           * Should only happen when called from cgi_show_url_info().
 267           */
 268          url_noproto += 8;
 269          http->ssl = 1;
 270       }
 271       else if (*url_noproto == '/')
 272       {
 273         /*
 274          * Short request line without protocol and host.
 275          * Most likely because the client's request
 276          * was intercepted and redirected into Privoxy.
 277          */
 278          http->host = NULL;
 279          host_available = 0;
 280       }
 281       else if (require_protocol)
 282       {
 283          freez(buf);
 284          return JB_ERR_PARSE;
 285       }
 286
 287       url_path = strchr(url_noproto, '/');
 288       if (url_path != NULL)
 289       {
 290          /*
 291           * Got a path.
 292           *
 293           * If FEATURE_HTTPS_INSPECTION isn't available, ignore the
 294           * path for https URLs so that we get consistent behaviour
 295           * if a https URL is parsed. When the URL is actually
 296           * retrieved, https hides the path part.
 297           */
 298          http->path = strdup_or_die(
 299 #ifndef FEATURE_HTTPS_INSPECTION
 300             http->ssl ? "/" :
 301 #endif
 302             url_path
 303          );
 304          *url_path = '\0';
 305          http->hostport = strdup_or_die(url_noproto);
 306       }
 307       else
 308       {
 309          /*
 310           * Repair broken HTTP requests that don't contain a path,
 311           * or CONNECT requests
 312           */
 313          http->path = strdup_or_die("/");
 314          http->hostport = strdup_or_die(url_noproto);
 315       }
 316
 317       freez(buf);
 318    }
 319
 320    if (!host_available)
 321    {
 322       /* Without host, there is nothing left to do here */
 323       return JB_ERR_OK;
 324    }
 325
 326    /*
 327     * Split hostport into user/password (ignored), host, port.
 328     */
 329    {
 330       char *buf;
 331       char *host;
 332       char *port;
 333
 334       buf = strdup_or_die(http->hostport);
 335
 336       /* check if url contains username and/or password */
 337       host = strchr(buf, '@');
 338       if (host != NULL)
 339       {
 340          /* Contains username/password, skip it and the @ sign. */
 341          host++;
 342       }
 343       else
 344       {
 345          /* No username or password. */
 346          host = buf;
 347       }
 348
 349       /* Move after hostname before port number */
 350       if (*host == '[')
 351       {
 352          /* Numeric IPv6 address delimited by brackets */
 353          host++;
 354          port = strchr(host, ']');
 355
 356          if (port == NULL)
 357          {
 358             /* Missing closing bracket */
 359             freez(buf);
 360             return JB_ERR_PARSE;
 361          }
 362
 363          *port++ = '\0';
 364
 365          if (*port == '\0')
 366          {
 367             port = NULL;
 368          }
 369          else if (*port != ':')
 370          {
 371             /* Garbage after closing bracket */
 372             freez(buf);
 373             return JB_ERR_PARSE;
 374          }
 375       }
 376       else
 377       {
 378          /* Plain non-escaped hostname */
 379          port = strchr(host, ':');
 380       }
 381
 382       /* check if url contains port */
 383       if (port != NULL)
 384       {
 385          /* Contains port */
 386          char *endptr;
 387          long parsed_port;
 388          /* Terminate hostname and point to start of port string */
 389          *port++ = '\0';
 390          parsed_port = strtol(port, &endptr, 10);
 391          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 392          {
 393             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 394             freez(buf);
 395             return JB_ERR_PARSE;
 396          }
 397          http->port = (int)parsed_port;
 398       }
 399       else
 400       {
 401          /* No port specified. */
 402          http->port = (http->ssl ? 443 : 80);
 403       }
 404
 405       http->host = strdup_or_die(host);
 406
 407       freez(buf);
 408    }
 409
 410    /* Split domain name so we can compare it against wildcards */
 411    return init_domain_components(http);
 412
 413 }
 414
 415
 416 /*********************************************************************
 417  *
 418  * Function    :  unknown_method
 419  *
 420  * Description :  Checks whether a method is unknown.
 421  *
 422  * Parameters  :
 423  *          1  :  method = points to a http method
 424  *
 425  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 426  *
 427  *********************************************************************/
 428 static int unknown_method(const char *method)
 429 {
 430    static const char * const known_http_methods[] = {
 431       /* Basic HTTP request type */
 432       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 433       /* webDAV extensions (RFC2518) */
 434       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 435       /*
 436        * Microsoft webDAV extension for Exchange 2000.  See:
 437        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 438        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 439        */
 440       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 441       /*
 442        * Another Microsoft webDAV extension for Exchange 2000.  See:
 443        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 444        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 445        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 446        */
 447       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 448       /*
 449        * Yet another WebDAV extension, this time for
 450        * Web Distributed Authoring and Versioning (RFC3253)
 451        */
 452       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 453       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 454       /*
 455        * The PATCH method is defined by RFC5789, the format of the
 456        * actual patch in the body depends on the application, but from
 457        * Privoxy's point of view it doesn't matter.
 458        */
 459       "PATCH",
 460    };
 461    int i;
 462
 463    for (i = 0; i < SZ(known_http_methods); i++)
 464    {
 465       if (0 == strcmpic(method, known_http_methods[i]))
 466       {
 467          return FALSE;
 468       }
 469    }
 470
 471    return TRUE;
 472
 473 }
 474
 475
 476 /*********************************************************************
 477  *
 478  * Function    :  normalize_http_version
 479  *
 480  * Description :  Take a supported HTTP version string and remove
 481  *                leading zeroes etc., reject unsupported versions.
 482  *
 483  *                This is an explicit RFC 2616 (3.1) MUST and
 484  *                RFC 7230 mandates that intermediaries send their
 485  *                own HTTP-version in forwarded messages.
 486  *
 487  * Parameters  :
 488  *          1  :  http_version = HTTP version string
 489  *
 490  * Returns     :  JB_ERR_OK on success
 491  *                JB_ERR_PARSE if the HTTP version is unsupported
 492  *
 493  *********************************************************************/
 494 static jb_err normalize_http_version(char *http_version)
 495 {
 496    unsigned int major_version;
 497    unsigned int minor_version;
 498
 499    if (2 != sscanf(http_version, "HTTP/%u.%u", &major_version, &minor_version))
 500    {
 501       log_error(LOG_LEVEL_ERROR, "Unsupported HTTP version: %s", http_version);
 502       return JB_ERR_PARSE;
 503    }
 504
 505    if (major_version != 1 || (minor_version != 0 && minor_version != 1))
 506    {
 507       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 508          "versions are 1.0 and 1.1. This rules out: %s", http_version);
 509       return JB_ERR_PARSE;
 510    }
 511
 512    assert(strlen(http_version) >= 8);
 513    snprintf(http_version, 9, "HTTP/%u.%u", major_version, minor_version);
 514
 515    return JB_ERR_OK;
 516
 517 }
 518
 519
 520 /*********************************************************************
 521  *
 522  * Function    :  parse_http_request
 523  *
 524  * Description :  Parse out the host and port from the URL.  Find the
 525  *                hostname & path, port (if ':'), and/or password (if '@')
 526  *
 527  * Parameters  :
 528  *          1  :  req = HTTP request line to break down
 529  *          2  :  http = pointer to the http structure to hold elements
 530  *
 531  * Returns     :  JB_ERR_OK on success
 532  *                JB_ERR_CGI_PARAMS on malformed command/URL
 533  *                                  or >100 domains deep.
 534  *
 535  *********************************************************************/
 536 jb_err parse_http_request(const char *req, struct http_request *http)
 537 {
 538    char *buf;
 539    char *v[3];
 540    int n;
 541    jb_err err;
 542
 543    memset(http, '\0', sizeof(*http));
 544
 545    buf = strdup_or_die(req);
 546
 547    n = ssplit(buf, " \r\n", v, SZ(v));
 548    if (n != 3)
 549    {
 550       freez(buf);
 551       return JB_ERR_PARSE;
 552    }
 553
 554    /*
 555     * Fail in case of unknown methods
 556     * which we might not handle correctly.
 557     *
 558     * XXX: There should be a config option
 559     * to forward requests with unknown methods
 560     * anyway. Most of them don't need special
 561     * steps.
 562     */
 563    if (unknown_method(v[0]))
 564    {
 565       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 566       freez(buf);
 567       return JB_ERR_PARSE;
 568    }
 569
 570    if (JB_ERR_OK != normalize_http_version(v[2]))
 571    {
 572       freez(buf);
 573       return JB_ERR_PARSE;
 574    }
 575
 576    http->ssl = !strcmpic(v[0], "CONNECT");
 577
 578    err = parse_http_url(v[1], http, !http->ssl);
 579    if (err)
 580    {
 581       freez(buf);
 582       return err;
 583    }
 584
 585    /*
 586     * Copy the details into the structure
 587     */
 588    http->cmd = strdup_or_die(req);
 589    http->gpc = strdup_or_die(v[0]);
 590    http->version = strdup_or_die(v[2]);
 591    http->ocmd = strdup_or_die(http->cmd);
 592
 593    freez(buf);
 594
 595    return JB_ERR_OK;
 596
 597 }
 598
 599
 600 /*********************************************************************
 601  *
 602  * Function    :  compile_pattern
 603  *
 604  * Description :  Compiles a host, domain or TAG pattern.
 605  *
 606  * Parameters  :
 607  *          1  :  pattern = The pattern to compile.
 608  *          2  :  anchoring = How the regex should be modified
 609  *                            before compilation. Can be either
 610  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 611  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 612  *          3  :  url     = In case of failures, the spec member is
 613  *                          logged and the structure freed.
 614  *          4  :  regex   = Where the compiled regex should be stored.
 615  *
 616  * Returns     :  JB_ERR_OK - Success
 617  *                JB_ERR_PARSE - Cannot parse regex
 618  *
 619  *********************************************************************/
 620 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 621                               struct pattern_spec *url, regex_t **regex)
 622 {
 623    int errcode;
 624    const char *fmt = NULL;
 625    char *rebuf;
 626    size_t rebuf_size;
 627
 628    assert(pattern);
 629
 630    if (pattern[0] == '\0')
 631    {
 632       *regex = NULL;
 633       return JB_ERR_OK;
 634    }
 635
 636    switch (anchoring)
 637    {
 638       case NO_ANCHORING:
 639          fmt = "%s";
 640          break;
 641       case RIGHT_ANCHORED:
 642          fmt = "%s$";
 643          break;
 644       case RIGHT_ANCHORED_HOST:
 645          fmt = "%s\\.?$";
 646          break;
 647       case LEFT_ANCHORED:
 648          fmt = "^%s";
 649          break;
 650       default:
 651          log_error(LOG_LEVEL_FATAL,
 652             "Invalid anchoring in compile_pattern %d", anchoring);
 653    }
 654    rebuf_size = strlen(pattern) + strlen(fmt);
 655    rebuf = malloc_or_die(rebuf_size);
 656    *regex = zalloc_or_die(sizeof(**regex));
 657
 658    snprintf(rebuf, rebuf_size, fmt, pattern);
 659
 660    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 661
 662    if (errcode)
 663    {
 664       size_t errlen = regerror(errcode, *regex, rebuf, rebuf_size);
 665       if (errlen > (rebuf_size - (size_t)1))
 666       {
 667          errlen = rebuf_size - (size_t)1;
 668       }
 669       rebuf[errlen] = '\0';
 670       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 671          pattern, url->spec, rebuf);
 672       free_pattern_spec(url);
 673       freez(rebuf);
 674
 675       return JB_ERR_PARSE;
 676    }
 677    freez(rebuf);
 678
 679    return JB_ERR_OK;
 680
 681 }
 682
 683
 684 /*********************************************************************
 685  *
 686  * Function    :  compile_url_pattern
 687  *
 688  * Description :  Compiles the three parts of an URL pattern.
 689  *
 690  * Parameters  :
 691  *          1  :  url = Target pattern_spec to be filled in.
 692  *          2  :  buf = The url pattern to compile. Will be messed up.
 693  *
 694  * Returns     :  JB_ERR_OK - Success
 695  *                JB_ERR_MEMORY - Out of memory
 696  *                JB_ERR_PARSE - Cannot parse regex
 697  *
 698  *********************************************************************/
 699 static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
 700 {
 701    char *p;
 702
 703 #ifdef FEATURE_PCRE_HOST_PATTERNS
 704    const size_t prefix_length = 18;
 705    if (strncmpic(buf, "PCRE-HOST-PATTERN:", prefix_length) == 0)
 706    {
 707       url->pattern.url_spec.host_regex_type = PCRE_HOST_PATTERN;
 708       /* Overwrite the "PCRE-HOST-PATTERN:" prefix */
 709       memmove(buf, buf+prefix_length, strlen(buf+prefix_length)+1);
 710    }
 711    else
 712    {
 713       url->pattern.url_spec.host_regex_type = VANILLA_HOST_PATTERN;
 714    }
 715 #endif
 716
 717    p = strchr(buf, '/');
 718    if (NULL != p)
 719    {
 720       /*
 721        * Only compile the regex if it consists of more than
 722        * a single slash, otherwise it wouldn't affect the result.
 723        */
 724       if (p[1] != '\0')
 725       {
 726          /*
 727           * XXX: does it make sense to compile the slash at the beginning?
 728           */
 729          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
 730
 731          if (JB_ERR_OK != err)
 732          {
 733             return err;
 734          }
 735       }
 736       *p = '\0';
 737    }
 738
 739    /*
 740     * IPv6 numeric hostnames can contain colons, thus we need
 741     * to delimit the hostname before the real port separator.
 742     * As brackets are already used in the hostname pattern,
 743     * we use angle brackets ('<', '>') instead.
 744     */
 745    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 746    {
 747       *p++ = '\0';
 748       buf++;
 749
 750       if (*p == '\0')
 751       {
 752          /* IPv6 address without port number */
 753          p = NULL;
 754       }
 755       else if (*p != ':')
 756       {
 757          /* Garbage after address delimiter */
 758          return JB_ERR_PARSE;
 759       }
 760    }
 761    else
 762    {
 763       p = strchr(buf, ':');
 764    }
 765
 766    if (NULL != p)
 767    {
 768       *p++ = '\0';
 769       url->pattern.url_spec.port_list = strdup_or_die(p);
 770    }
 771    else
 772    {
 773       url->pattern.url_spec.port_list = NULL;
 774    }
 775
 776    if (buf[0] != '\0')
 777    {
 778 #ifdef FEATURE_PCRE_HOST_PATTERNS
 779       if (url->pattern.url_spec.host_regex_type == PCRE_HOST_PATTERN)
 780       {
 781          return compile_pcre_host_pattern(url, buf);
 782       }
 783       else
 784 #endif
 785       {
 786          return compile_vanilla_host_pattern(url, buf);
 787       }
 788    }
 789
 790    return JB_ERR_OK;
 791
 792 }
 793
 794
 795 #ifdef FEATURE_PCRE_HOST_PATTERNS
 796 /*********************************************************************
 797  *
 798  * Function    :  compile_pcre_host_pattern
 799  *
 800  * Description :  Parses and compiles a pcre host pattern.
 801  *
 802  * Parameters  :
 803  *          1  :  url = Target pattern_spec to be filled in.
 804  *          2  :  host_pattern = Host pattern to compile.
 805  *
 806  * Returns     :  JB_ERR_OK - Success
 807  *                JB_ERR_MEMORY - Out of memory
 808  *                JB_ERR_PARSE - Cannot parse regex
 809  *
 810  *********************************************************************/
 811 static jb_err compile_pcre_host_pattern(struct pattern_spec *url, const char *host_pattern)
 812 {
 813    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->pattern.url_spec.host_regex);
 814 }
 815 #endif /* def FEATURE_PCRE_HOST_PATTERNS */
 816
 817
 818 /*********************************************************************
 819  *
 820  * Function    :  compile_vanilla_host_pattern
 821  *
 822  * Description :  Parses and "compiles" an old-school host pattern.
 823  *
 824  * Parameters  :
 825  *          1  :  url = Target pattern_spec to be filled in.
 826  *          2  :  host_pattern = Host pattern to parse.
 827  *
 828  * Returns     :  JB_ERR_OK - Success
 829  *                JB_ERR_PARSE - Cannot parse regex
 830  *
 831  *********************************************************************/
 832 static jb_err compile_vanilla_host_pattern(struct pattern_spec *url, const char *host_pattern)
 833 {
 834    char *v[150];
 835    size_t size;
 836    char *p;
 837
 838    /*
 839     * Parse domain part
 840     */
 841    if (host_pattern[strlen(host_pattern) - 1] == '.')
 842    {
 843       url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
 844    }
 845    if (host_pattern[0] == '.')
 846    {
 847       url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
 848    }
 849
 850    /*
 851     * Split domain into components
 852     */
 853    url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
 854
 855    /*
 856     * Map to lower case
 857     */
 858    for (p = url->pattern.url_spec.dbuffer; *p ; p++)
 859    {
 860       *p = (char)privoxy_tolower(*p);
 861    }
 862
 863    /*
 864     * Split the domain name into components
 865     */
 866    url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
 867
 868    if (url->pattern.url_spec.dcount < 0)
 869    {
 870       free_pattern_spec(url);
 871       return JB_ERR_PARSE;
 872    }
 873    else if (url->pattern.url_spec.dcount != 0)
 874    {
 875       /*
 876        * Save a copy of the pointers in dvec
 877        */
 878       size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
 879
 880       url->pattern.url_spec.dvec = malloc_or_die(size);
 881
 882       memcpy(url->pattern.url_spec.dvec, v, size);
 883    }
 884    /*
 885     * else dcount == 0 in which case we needn't do anything,
 886     * since dvec will never be accessed and the pattern will
 887     * match all domains.
 888     */
 889    return JB_ERR_OK;
 890 }
 891
 892
 893 /*********************************************************************
 894  *
 895  * Function    :  simplematch
 896  *
 897  * Description :  String matching, with a (greedy) '*' wildcard that
 898  *                stands for zero or more arbitrary characters and
 899  *                character classes in [], which take both enumerations
 900  *                and ranges.
 901  *
 902  * Parameters  :
 903  *          1  :  pattern = pattern for matching
 904  *          2  :  text    = text to be matched
 905  *
 906  * Returns     :  0 if match, else nonzero
 907  *
 908  *********************************************************************/
 909 static int simplematch(const char *pattern, const char *text)
 910 {
 911    const unsigned char *pat = (const unsigned char *)pattern;
 912    const unsigned char *txt = (const unsigned char *)text;
 913    const unsigned char *fallback = pat;
 914    int wildcard = 0;
 915
 916    unsigned char lastchar = 'a';
 917    unsigned i;
 918    unsigned char charmap[32];
 919
 920    while (*txt)
 921    {
 922
 923       /* EOF pattern but !EOF text? */
 924       if (*pat == '\0')
 925       {
 926          if (wildcard)
 927          {
 928             pat = fallback;
 929          }
 930          else
 931          {
 932             return 1;
 933          }
 934       }
 935
 936       /* '*' in the pattern?  */
 937       if (*pat == '*')
 938       {
 939
 940          /* The pattern ends afterwards? Speed up the return. */
 941          if (*++pat == '\0')
 942          {
 943             return 0;
 944          }
 945
 946          /* Else, set wildcard mode and remember position after '*' */
 947          wildcard = 1;
 948          fallback = pat;
 949       }
 950
 951       /* Character range specification? */
 952       if (*pat == '[')
 953       {
 954          memset(charmap, '\0', sizeof(charmap));
 955
 956          while (*++pat != ']')
 957          {
 958             if (!*pat)
 959             {
 960                return 1;
 961             }
 962             else if (*pat == '-')
 963             {
 964                if ((*++pat == ']') || *pat == '\0')
 965                {
 966                   return(1);
 967                }
 968                for (i = lastchar; i <= *pat; i++)
 969                {
 970                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 971                }
 972             }
 973             else
 974             {
 975                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 976                lastchar = *pat;
 977             }
 978          }
 979       } /* -END- if Character range specification */
 980
 981
 982       /*
 983        * Char match, or char range match?
 984        */
 985       if ((*pat == *txt)
 986        || (*pat == '?')
 987        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 988       {
 989          /*
 990           * Success: Go ahead
 991           */
 992          pat++;
 993       }
 994       else if (!wildcard)
 995       {
 996          /*
 997           * No match && no wildcard: No luck
 998           */
 999          return 1;
1000       }
1001       else if (pat != fallback)
1002       {
1003          /*
1004           * Increment text pointer if in char range matching
1005           */
1006          if (*pat == ']')
1007          {
1008             txt++;
1009          }
1010          /*
1011           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
1012           */
1013          pat = fallback;
1014          /*
1015           * Restart matching from current text pointer
1016           */
1017          continue;
1018       }
1019       txt++;
1020    }
1021
1022    /* Cut off extra '*'s */
1023    if (*pat == '*') pat++;
1024
1025    /* If this is the pattern's end, fine! */
1026    return(*pat);
1027
1028 }
1029
1030
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Compares two vectors of domain name components
 *                pairwise with simplematch().  Exactly len components
 *                are compared and every one of them has to match, so
 *                the comparison is anchored on both sides.
 *                This is only used by domain_match.
 *
 * Parameters  :
 *          1  :  pv = vector of pattern components
 *          2  :  fv = vector of domain components to check
 *          3  :  len = number of components to compare; both vectors
 *                      must provide at least this many entries.
 *
 * Returns     :  0 => all components match, 1 => at least one does not.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   char **pattern_part = pv;
   char **domain_part  = fv;
   int remaining       = len;

   while (remaining > 0)
   {
      /* simplematch() returns nonzero if the component doesn't match */
      if (simplematch(*pattern_part, *domain_part) != 0)
      {
         return 1;
      }

      pattern_part++;
      domain_part++;
      remaining--;
   }

   return 0;

}
1065
1066
1067 /*********************************************************************
1068  *
1069  * Function    :  domain_match
1070  *
1071  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1072  *                p.pattern->unachored, the comparison is un-, left-,
1073  *                right-anchored, or both.
1074  *                The individual domain names are compared with
1075  *                simplematch().
1076  *
1077  * Parameters  :
1078  *          1  :  p = a domain that may contain a '*' as a wildcard.
1079  *          2  :  fqdn = domain name against which the patterns are compared.
1080  *
1081  * Returns     :  0 => domains are equivalent, else no match.
1082  *
1083  *********************************************************************/
1084 static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
1085 {
1086    char **pv, **fv;  /* vectors  */
1087    int    plen, flen;
1088    int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1089
1090    plen = p->pattern.url_spec.dcount;
1091    flen = fqdn->dcount;
1092
1093    if (flen < plen)
1094    {
1095       /* fqdn is too short to match this pattern */
1096       return 1;
1097    }
1098
1099    pv   = p->pattern.url_spec.dvec;
1100    fv   = fqdn->dvec;
1101
1102    if (unanchored == ANCHOR_LEFT)
1103    {
1104       /*
1105        * Right anchored.
1106        *
1107        * Convert this into a fully anchored pattern with
1108        * the fqdn and pattern the same length
1109        */
1110       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1111       return simple_domaincmp(pv, fv, plen);
1112    }
1113    else if (unanchored == 0)
1114    {
1115       /* Fully anchored, check length */
1116       if (flen != plen)
1117       {
1118          return 1;
1119       }
1120       return simple_domaincmp(pv, fv, plen);
1121    }
1122    else if (unanchored == ANCHOR_RIGHT)
1123    {
1124       /* Left anchored, ignore all extra in fqdn */
1125       return simple_domaincmp(pv, fv, plen);
1126    }
1127    else
1128    {
1129       /* Unanchored */
1130       int n;
1131       int maxn = flen - plen;
1132       for (n = 0; n <= maxn; n++)
1133       {
1134          if (!simple_domaincmp(pv, fv, plen))
1135          {
1136             return 0;
1137          }
1138          /*
1139           * Doesn't match from start of fqdn
1140           * Try skipping first part of fqdn
1141           */
1142          fv++;
1143       }
1144       return 1;
1145    }
1146
1147 }
1148
1149
1150 /*********************************************************************
1151  *
1152  * Function    :  create_pattern_spec
1153  *
1154  * Description :  Creates a "pattern_spec" structure from a string.
1155  *                When finished, free with free_pattern_spec().
1156  *
1157  * Parameters  :
1158  *          1  :  pattern = Target pattern_spec to be filled in.
1159  *                          Will be zeroed before use.
1160  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1161  *                      contents of this buffer are destroyed by this
1162  *                      function.  If this function succeeds, the
1163  *                      buffer is copied to pattern->spec.  If this
1164  *                      function fails, the contents of the buffer
1165  *                      are lost forever.
1166  *
1167  * Returns     :  JB_ERR_OK - Success
1168  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1169  *                               written to system log)
1170  *
1171  *********************************************************************/
1172 jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
1173 {
1174    static const struct
1175    {
1176       /** The tag pattern prefix to match */
1177       const char *prefix;
1178
1179       /** The length of the prefix to match */
1180       const size_t prefix_length;
1181
1182       /** The pattern flag */
1183       const unsigned flag;
1184    } tag_pattern[] = {
1185       { "TAG:",              4, PATTERN_SPEC_TAG_PATTERN},
1186  #ifdef FEATURE_CLIENT_TAGS
1187       { "CLIENT-TAG:",      11, PATTERN_SPEC_CLIENT_TAG_PATTERN},
1188  #endif
1189       { "NO-REQUEST-TAG:",  15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
1190       { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
1191    };
1192    int i;
1193
1194    assert(pattern);
1195    assert(buf);
1196
1197    memset(pattern, '\0', sizeof(*pattern));
1198
1199    /* Remember the original specification for the CGI pages. */
1200    pattern->spec = strdup_or_die(buf);
1201
1202    /* Check if it's a tag pattern */
1203    for (i = 0; i < SZ(tag_pattern); i++)
1204    {
1205       if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
1206       {
1207          /* The regex starts after the prefix */
1208          const char *tag_regex = buf + tag_pattern[i].prefix_length;
1209
1210          pattern->flags |= tag_pattern[i].flag;
1211
1212          return compile_pattern(tag_regex, NO_ANCHORING, pattern,
1213             &pattern->pattern.tag_regex);
1214       }
1215    }
1216
1217    /* If it isn't a tag pattern it must be an URL pattern. */
1218    pattern->flags |= PATTERN_SPEC_URL_PATTERN;
1219
1220    return compile_url_pattern(pattern, buf);
1221
1222 }
1223
1224
1225 /*********************************************************************
1226  *
1227  * Function    :  free_pattern_spec
1228  *
1229  * Description :  Called from the "unloaders".  Freez the pattern
1230  *                structure elements.
1231  *
1232  * Parameters  :
1233  *          1  :  pattern = pointer to a pattern_spec structure.
1234  *
1235  * Returns     :  N/A
1236  *
1237  *********************************************************************/
1238 void free_pattern_spec(struct pattern_spec *pattern)
1239 {
1240    if (pattern == NULL) return;
1241
1242    freez(pattern->spec);
1243 #ifdef FEATURE_PCRE_HOST_PATTERNS
1244    if (pattern->pattern.url_spec.host_regex)
1245    {
1246       regfree(pattern->pattern.url_spec.host_regex);
1247       freez(pattern->pattern.url_spec.host_regex);
1248    }
1249 #endif /* def FEATURE_PCRE_HOST_PATTERNS */
1250    freez(pattern->pattern.url_spec.dbuffer);
1251    freez(pattern->pattern.url_spec.dvec);
1252    pattern->pattern.url_spec.dcount = 0;
1253    freez(pattern->pattern.url_spec.port_list);
1254    if (pattern->pattern.url_spec.preg)
1255    {
1256       regfree(pattern->pattern.url_spec.preg);
1257       freez(pattern->pattern.url_spec.preg);
1258    }
1259    if (pattern->pattern.tag_regex)
1260    {
1261       regfree(pattern->pattern.tag_regex);
1262       freez(pattern->pattern.tag_regex);
1263    }
1264 }
1265
1266
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Compares a port against a port list.  A missing
 *                port list matches every port.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL to accept any port.
 *
 * Returns     :  TRUE for yes, FALSE otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (port_list == NULL)
   {
      /* No restriction given */
      return 1;
   }

   return (match_portlist(port_list, port) != 0);
}
1284
1285
1286 /*********************************************************************
1287  *
1288  * Function    :  host_matches
1289  *
1290  * Description :  Compares a host against a host pattern.
1291  *
1292  * Parameters  :
1293  *          1  :  url = The URL to match
1294  *          2  :  pattern = The URL pattern
1295  *
1296  * Returns     :  TRUE for yes, FALSE otherwise.
1297  *
1298  *********************************************************************/
1299 static int host_matches(const struct http_request *http,
1300                         const struct pattern_spec *pattern)
1301 {
1302    assert(http->host != NULL);
1303 #ifdef FEATURE_PCRE_HOST_PATTERNS
1304    if (pattern->pattern.url_spec.host_regex_type == PCRE_HOST_PATTERN)
1305    {
1306       return ((NULL == pattern->pattern.url_spec.host_regex)
1307          || (0 == regexec(pattern->pattern.url_spec.host_regex,
1308                http->host, 0, NULL, 0)));
1309    }
1310 #endif
1311    return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
1312 }
1313
1314
1315 /*********************************************************************
1316  *
1317  * Function    :  path_matches
1318  *
1319  * Description :  Compares a path against a path pattern.
1320  *
1321  * Parameters  :
1322  *          1  :  path = The path to match
1323  *          2  :  pattern = The URL pattern
1324  *
1325  * Returns     :  TRUE for yes, FALSE otherwise.
1326  *
1327  *********************************************************************/
1328 static int path_matches(const char *path, const struct pattern_spec *pattern)
1329 {
1330    return ((NULL == pattern->pattern.url_spec.preg)
1331       || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
1332 }
1333
1334
1335 /*********************************************************************
1336  *
1337  * Function    :  url_match
1338  *
1339  * Description :  Compare a URL against a URL pattern.
1340  *
1341  * Parameters  :
1342  *          1  :  pattern = a URL pattern
1343  *          2  :  url = URL to match
1344  *
1345  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1346  *
1347  *********************************************************************/
1348 int url_match(const struct pattern_spec *pattern,
1349               const struct http_request *http)
1350 {
1351    if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
1352    {
1353       /* It's not an URL pattern and thus shouldn't be matched against URLs */
1354       return 0;
1355    }
1356
1357    return (port_matches(http->port, pattern->pattern.url_spec.port_list)
1358       && host_matches(http, pattern) && path_matches(http->path, pattern));
1359
1360 }
1361
1362
/*********************************************************************
 *
 * Function    :  portlist_item_value
 *
 * Description :  Helper for match_portlist.  Converts the number at
 *                the start of a port list token the way atoi() would
 *                convert a copy of the token, without needing one.
 *
 * Parameters  :
 *          1  :  token = Start of the token.
 *          2  :  token_end = First character behind the token.  Has
 *                            to point to a '-', a ',' or the
 *                            terminating '\0' of the list.
 *
 * Returns     :  The value of the token, 0 if it contains no number.
 *
 *********************************************************************/
static int portlist_item_value(const char *token, const char *token_end)
{
   /*
    * Skip leading whitespace by hand so a '-' that ends the token
    * can't be mistaken for the sign of the number behind it.
    */
   while ((token < token_end) && isspace((unsigned char)*token))
   {
      token++;
   }

   if (token == token_end)
   {
      /* Empty token, for example the omitted end of the range "1024-" */
      return 0;
   }

   /*
    * The token starts with a character that is neither whitespace
    * nor the delimiter, thus strtol() can't get past token_end:
    * none of the delimiters is a digit and a sign is only accepted
    * as first character.
    */
   return (int)strtol(token, NULL, 10);
}


/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *                The list is parsed in place, no memory is allocated.
 *
 *                A range with omitted start ("-c") begins at 0, a
 *                range with omitted (or zero) end ("b-") ends at
 *                65535.  Items without number are treated as 0.
 *
 * Parameters  :
 *          1  :  portlist = String with list, must not be NULL
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   assert(portlist != NULL);

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      const char *comma    = strchr(item, (int) ',');
      const char *item_end = (NULL != comma) ? comma : item + strlen(item);
      const char *dash     = memchr(item, (int) '-', (size_t)(item_end - item));

      if (NULL == dash)
      {
         /*
          * No dash, check for equality
          */
         if (port == portlist_item_value(item, item_end))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         const int min = portlist_item_value(item, dash);
         int max = portlist_item_value(dash + 1, item_end);

         if (0 == max)
         {
            max = 65535;
         }

         if ((port >= min) && (port <= max))
         {
            return 1;
         }
      }

      /*
       * Jump to next item, if there is one
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1441
1442
1443 /*********************************************************************
1444  *
1445  * Function    :  parse_forwarder_address
1446  *
1447  * Description :  Parse out the username, password, host and port from
1448  *                a forwarder address.
1449  *
1450  * Parameters  :
1451  *          1  :  address = The forwarder address to parse.
1452  *          2  :  hostname = Used to return the hostname. NULL on error.
1453  *          3  :  port = Used to return the port. Untouched if no port
1454  *                       is specified.
1455  *          4  :  username = Used to return the username if any.
1456  *          5  :  password = Used to return the password if any.
1457  *
1458  * Returns     :  JB_ERR_OK on success
1459  *                JB_ERR_MEMORY on out of memory
1460  *                JB_ERR_PARSE on malformed address.
1461  *
1462  *********************************************************************/
1463 jb_err parse_forwarder_address(char *address, char **hostname, int *port,
1464                                char **username, char **password)
1465 {
1466    char *p;
1467    char *tmp;
1468
1469    tmp = *hostname = strdup_or_die(address);
1470
1471    /* Parse username and password */
1472    if (username && password && (NULL != (p = strchr(*hostname, '@'))))
1473    {
1474       *p++ = '\0';
1475       *username = strdup_or_die(*hostname);
1476       *hostname = strdup_or_die(p);
1477
1478       if (NULL != (p = strchr(*username, ':')))
1479       {
1480          *p++ = '\0';
1481          *password = strdup_or_die(p);
1482       }
1483       freez(tmp);
1484    }
1485
1486    /* Parse hostname and port */
1487    p = *hostname;
1488    if ((*p == '[') && (NULL == strchr(p, ']')))
1489    {
1490       /* XXX: Should do some more validity checks here. */
1491       return JB_ERR_PARSE;
1492    }
1493
1494    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1495    {
1496       *p++ = '\0';
1497       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1498       if (*p == ':')
1499       {
1500          *port = (int)strtol(++p, NULL, 0);
1501       }
1502    }
1503    else if (NULL != (p = strchr(*hostname, ':')))
1504    {
1505       *p++ = '\0';
1506       *port = (int)strtol(p, NULL, 0);
1507    }
1508
1509    return JB_ERR_OK;
1510
1511 }
1512
1513
1514 /*
1515   Local Variables:
1516   tab-width: 3
1517   end:
1518 */