urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.87 2016/02/26 12:29:39 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2014
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
  59 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  60
  61 enum regex_anchoring
  62 {
  63    NO_ANCHORING,
  64    LEFT_ANCHORED,
  65    RIGHT_ANCHORED,
  66    RIGHT_ANCHORED_HOST
  67 };
  68 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern);
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_PARSE on malformed command/URL
 119  *                             or >100 domains deep.
 120  *
 121  *********************************************************************/
 122 jb_err init_domain_components(struct http_request *http)
 123 {
 124    char *vec[BUFFER_SIZE];
 125    size_t size;
 126    char *p;
 127
 128    http->dbuffer = strdup_or_die(http->host);
 129
 130    /* map to lower case */
 131    for (p = http->dbuffer; *p ; p++)
 132    {
 133       *p = (char)privoxy_tolower(*p);
 134    }
 135
 136    /* split the domain name into components */
 137    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 138
 139    if (http->dcount <= 0)
 140    {
 141       /*
 142        * Error: More than SZ(vec) components in domain
 143        *    or: no components in domain
 144        */
 145       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 146       return JB_ERR_PARSE;
 147    }
 148
 149    /* save a copy of the pointers in dvec */
 150    size = (size_t)http->dcount * sizeof(*http->dvec);
 151
 152    http->dvec = malloc_or_die(size);
 153
 154    memcpy(http->dvec, vec, size);
 155
 156    return JB_ERR_OK;
 157 }
 158 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 159
 160
 161 /*********************************************************************
 162  *
 163  * Function    :  url_requires_percent_encoding
 164  *
 165  * Description :  Checks if an URL contains invalid characters
 166  *                according to RFC 3986 that should be percent-encoded.
 167  *                Does not verify whether or not the passed string
 168  *                actually is a valid URL.
 169  *
 170  * Parameters  :
 171  *          1  :  url = URL to check
 172  *
 173  * Returns     :  True in case of valid URLs, false otherwise
 174  *
 175  *********************************************************************/
 176 int url_requires_percent_encoding(const char *url)
 177 {
 178    static const char allowed_characters[128] = {
 179       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 180       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 181       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 182       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 183       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 184       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 185       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 186       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 187       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 188       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 189       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 190       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 191       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 192    };
 193
 194    while (*url != '\0')
 195    {
 196       const unsigned int i = (unsigned char)*url++;
 197       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 198       {
 199          return TRUE;
 200       }
 201    }
 202
 203    return FALSE;
 204
 205 }
 206
 207
 208 /*********************************************************************
 209  *
 210  * Function    :  parse_http_url
 211  *
 212  * Description :  Parse out the host and port from the URL.  Find the
 213  *                hostname & path, port (if ':'), and/or password (if '@')
 214  *
 215  * Parameters  :
 216  *          1  :  url = URL (or is it URI?) to break down
 217  *          2  :  http = pointer to the http structure to hold elements.
 218  *                       Must be initialized with valid values (like NULLs).
 219  *          3  :  require_protocol = Whether or not URLs without
 220  *                                   protocol are acceptable.
 221  *
 222  * Returns     :  JB_ERR_OK on success
 223  *                JB_ERR_PARSE on malformed command/URL
 224  *                             or >100 domains deep.
 225  *
 226  *********************************************************************/
 227 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 228 {
 229    int host_available = 1; /* A proxy can dream. */
 230
 231    /*
 232     * Save our initial URL
 233     */
 234    http->url = strdup_or_die(url);
 235
 236    /*
 237     * Check for * URI. If found, we're done.
 238     */
 239    if (*http->url == '*')
 240    {
 241       http->path = strdup_or_die("*");
 242       http->hostport = strdup_or_die("");
 243       if (http->url[1] != '\0')
 244       {
 245          return JB_ERR_PARSE;
 246       }
 247       return JB_ERR_OK;
 248    }
 249
 250
 251    /*
 252     * Split URL into protocol,hostport,path.
 253     */
 254    {
 255       char *buf;
 256       char *url_noproto;
 257       char *url_path;
 258
 259       buf = strdup_or_die(url);
 260
 261       /* Find the start of the URL in our scratch space */
 262       url_noproto = buf;
 263       if (strncmpic(url_noproto, "http://",  7) == 0)
 264       {
 265          url_noproto += 7;
 266       }
 267       else if (strncmpic(url_noproto, "https://", 8) == 0)
 268       {
 269          /*
 270           * Should only happen when called from cgi_show_url_info().
 271           */
 272          url_noproto += 8;
 273          http->ssl = 1;
 274       }
 275       else if (*url_noproto == '/')
 276       {
 277         /*
 278          * Short request line without protocol and host.
 279          * Most likely because the client's request
 280          * was intercepted and redirected into Privoxy.
 281          */
 282          http->host = NULL;
 283          host_available = 0;
 284       }
 285       else if (require_protocol)
 286       {
 287          freez(buf);
 288          return JB_ERR_PARSE;
 289       }
 290
 291       url_path = strchr(url_noproto, '/');
 292       if (url_path != NULL)
 293       {
 294          /*
 295           * Got a path.
 296           *
 297           * NOTE: The following line ignores the path for HTTPS URLS.
 298           * This means that you get consistent behaviour if you type a
 299           * https URL in and it's parsed by the function.  (When the
 300           * URL is actually retrieved, SSL hides the path part).
 301           */
 302          http->path = strdup_or_die(http->ssl ? "/" : url_path);
 303          *url_path = '\0';
 304          http->hostport = strdup_or_die(url_noproto);
 305       }
 306       else
 307       {
 308          /*
 309           * Repair broken HTTP requests that don't contain a path,
 310           * or CONNECT requests
 311           */
 312          http->path = strdup_or_die("/");
 313          http->hostport = strdup_or_die(url_noproto);
 314       }
 315
 316       freez(buf);
 317    }
 318
 319    if (!host_available)
 320    {
 321       /* Without host, there is nothing left to do here */
 322       return JB_ERR_OK;
 323    }
 324
 325    /*
 326     * Split hostport into user/password (ignored), host, port.
 327     */
 328    {
 329       char *buf;
 330       char *host;
 331       char *port;
 332
 333       buf = strdup_or_die(http->hostport);
 334
 335       /* check if url contains username and/or password */
 336       host = strchr(buf, '@');
 337       if (host != NULL)
 338       {
 339          /* Contains username/password, skip it and the @ sign. */
 340          host++;
 341       }
 342       else
 343       {
 344          /* No username or password. */
 345          host = buf;
 346       }
 347
 348       /* Move after hostname before port number */
 349       if (*host == '[')
 350       {
 351          /* Numeric IPv6 address delimited by brackets */
 352          host++;
 353          port = strchr(host, ']');
 354
 355          if (port == NULL)
 356          {
 357             /* Missing closing bracket */
 358             freez(buf);
 359             return JB_ERR_PARSE;
 360          }
 361
 362          *port++ = '\0';
 363
 364          if (*port == '\0')
 365          {
 366             port = NULL;
 367          }
 368          else if (*port != ':')
 369          {
 370             /* Garbage after closing bracket */
 371             freez(buf);
 372             return JB_ERR_PARSE;
 373          }
 374       }
 375       else
 376       {
 377          /* Plain non-escaped hostname */
 378          port = strchr(host, ':');
 379       }
 380
 381       /* check if url contains port */
 382       if (port != NULL)
 383       {
 384          /* Contains port */
 385          char *endptr;
 386          long parsed_port;
 387          /* Terminate hostname and point to start of port string */
 388          *port++ = '\0';
 389          parsed_port = strtol(port, &endptr, 10);
 390          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 391          {
 392             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 393             freez(buf);
 394             return JB_ERR_PARSE;
 395          }
 396          http->port = (int)parsed_port;
 397       }
 398       else
 399       {
 400          /* No port specified. */
 401          http->port = (http->ssl ? 443 : 80);
 402       }
 403
 404       http->host = strdup_or_die(host);
 405
 406       freez(buf);
 407    }
 408
 409 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 410    return JB_ERR_OK;
 411 #else
 412    /* Split domain name so we can compare it against wildcards */
 413    return init_domain_components(http);
 414 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 415
 416 }
 417
 418
 419 /*********************************************************************
 420  *
 421  * Function    :  unknown_method
 422  *
 423  * Description :  Checks whether a method is unknown.
 424  *
 425  * Parameters  :
 426  *          1  :  method = points to a http method
 427  *
 428  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 429  *
 430  *********************************************************************/
 431 static int unknown_method(const char *method)
 432 {
 433    static const char * const known_http_methods[] = {
 434       /* Basic HTTP request type */
 435       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 436       /* webDAV extensions (RFC2518) */
 437       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 438       /*
 439        * Microsoft webDAV extension for Exchange 2000.  See:
 440        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 441        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 442        */
 443       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 444       /*
 445        * Another Microsoft webDAV extension for Exchange 2000.  See:
 446        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 447        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 448        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 449        */
 450       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 451       /*
 452        * Yet another WebDAV extension, this time for
 453        * Web Distributed Authoring and Versioning (RFC3253)
 454        */
 455       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 456       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 457       /*
 458        * The PATCH method is defined by RFC5789, the format of the
 459        * actual patch in the body depends on the application, but from
 460        * Privoxy's point of view it doesn't matter.
 461        */
 462       "PATCH",
 463    };
 464    int i;
 465
 466    for (i = 0; i < SZ(known_http_methods); i++)
 467    {
 468       if (0 == strcmpic(method, known_http_methods[i]))
 469       {
 470          return FALSE;
 471       }
 472    }
 473
 474    return TRUE;
 475
 476 }
 477
 478
 479 /*********************************************************************
 480  *
 481  * Function    :  normalize_http_version
 482  *
 483  * Description :  Take a supported HTTP version string and remove
 484  *                leading zeroes etc., reject unsupported versions.
 485  *
 486  *                This is an explicit RFC 2616 (3.1) MUST and
 487  *                RFC 7230 mandates that intermediaries send their
 488  *                own HTTP-version in forwarded messages.
 489  *
 490  * Parameters  :
 491  *          1  :  http_version = HTTP version string
 492  *
 493  * Returns     :  JB_ERR_OK on success
 494  *                JB_ERR_PARSE if the HTTP version is unsupported
 495  *
 496  *********************************************************************/
 497 jb_err static normalize_http_version(char *http_version)
 498 {
 499    unsigned int major_version;
 500    unsigned int minor_version;
 501
 502    if (2 != sscanf(http_version, "HTTP/%u.%u", &major_version, &minor_version))
 503    {
 504       log_error(LOG_LEVEL_ERROR, "Unsupported HTTP version: %s", http_version);
 505       return JB_ERR_PARSE;
 506    }
 507
 508    if (major_version != 1 || (minor_version != 0 && minor_version != 1))
 509    {
 510       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 511          "versions are 1.0 and 1.1. This rules out: %s", http_version);
 512       return JB_ERR_PARSE;
 513    }
 514
 515    assert(strlen(http_version) >= 8);
 516    snprintf(http_version, 9, "HTTP/%u.%u", major_version, minor_version);
 517
 518    return JB_ERR_OK;
 519
 520 }
 521
 522
 523 /*********************************************************************
 524  *
 525  * Function    :  parse_http_request
 526  *
 527  * Description :  Parse out the host and port from the URL.  Find the
 528  *                hostname & path, port (if ':'), and/or password (if '@')
 529  *
 530  * Parameters  :
 531  *          1  :  req = HTTP request line to break down
 532  *          2  :  http = pointer to the http structure to hold elements
 533  *
 534  * Returns     :  JB_ERR_OK on success
 535  *                JB_ERR_CGI_PARAMS on malformed command/URL
 536  *                                  or >100 domains deep.
 537  *
 538  *********************************************************************/
 539 jb_err parse_http_request(const char *req, struct http_request *http)
 540 {
 541    char *buf;
 542    char *v[3];
 543    int n;
 544    jb_err err;
 545
 546    memset(http, '\0', sizeof(*http));
 547
 548    buf = strdup_or_die(req);
 549
 550    n = ssplit(buf, " \r\n", v, SZ(v));
 551    if (n != 3)
 552    {
 553       freez(buf);
 554       return JB_ERR_PARSE;
 555    }
 556
 557    /*
 558     * Fail in case of unknown methods
 559     * which we might not handle correctly.
 560     *
 561     * XXX: There should be a config option
 562     * to forward requests with unknown methods
 563     * anyway. Most of them don't need special
 564     * steps.
 565     */
 566    if (unknown_method(v[0]))
 567    {
 568       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 569       freez(buf);
 570       return JB_ERR_PARSE;
 571    }
 572
 573    if (JB_ERR_OK != normalize_http_version(v[2]))
 574    {
 575       freez(buf);
 576       return JB_ERR_PARSE;
 577    }
 578
 579    http->ssl = !strcmpic(v[0], "CONNECT");
 580
 581    err = parse_http_url(v[1], http, !http->ssl);
 582    if (err)
 583    {
 584       freez(buf);
 585       return err;
 586    }
 587
 588    /*
 589     * Copy the details into the structure
 590     */
 591    http->cmd = strdup_or_die(req);
 592    http->gpc = strdup_or_die(v[0]);
 593    http->ver = strdup_or_die(v[2]);
 594    http->ocmd = strdup_or_die(http->cmd);
 595
 596    freez(buf);
 597
 598    return JB_ERR_OK;
 599
 600 }
 601
 602
 603 /*********************************************************************
 604  *
 605  * Function    :  compile_pattern
 606  *
 607  * Description :  Compiles a host, domain or TAG pattern.
 608  *
 609  * Parameters  :
 610  *          1  :  pattern = The pattern to compile.
 611  *          2  :  anchoring = How the regex should be modified
 612  *                            before compilation. Can be either
 613  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 614  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 615  *          3  :  url     = In case of failures, the spec member is
 616  *                          logged and the structure freed.
 617  *          4  :  regex   = Where the compiled regex should be stored.
 618  *
 619  * Returns     :  JB_ERR_OK - Success
 620  *                JB_ERR_PARSE - Cannot parse regex
 621  *
 622  *********************************************************************/
 623 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 624                               struct pattern_spec *url, regex_t **regex)
 625 {
 626    int errcode;
 627    char rebuf[BUFFER_SIZE];
 628    const char *fmt = NULL;
 629
 630    assert(pattern);
 631    assert(strlen(pattern) < sizeof(rebuf) - 2);
 632
 633    if (pattern[0] == '\0')
 634    {
 635       *regex = NULL;
 636       return JB_ERR_OK;
 637    }
 638
 639    switch (anchoring)
 640    {
 641       case NO_ANCHORING:
 642          fmt = "%s";
 643          break;
 644       case RIGHT_ANCHORED:
 645          fmt = "%s$";
 646          break;
 647       case RIGHT_ANCHORED_HOST:
 648          fmt = "%s\\.?$";
 649          break;
 650       case LEFT_ANCHORED:
 651          fmt = "^%s";
 652          break;
 653       default:
 654          log_error(LOG_LEVEL_FATAL,
 655             "Invalid anchoring in compile_pattern %d", anchoring);
 656    }
 657
 658    *regex = zalloc_or_die(sizeof(**regex));
 659
 660    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 661
 662    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 663
 664    if (errcode)
 665    {
 666       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 667       if (errlen > (sizeof(rebuf) - (size_t)1))
 668       {
 669          errlen = sizeof(rebuf) - (size_t)1;
 670       }
 671       rebuf[errlen] = '\0';
 672       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 673          pattern, url->spec, rebuf);
 674       free_pattern_spec(url);
 675
 676       return JB_ERR_PARSE;
 677    }
 678
 679    return JB_ERR_OK;
 680
 681 }
 682
 683
 684 /*********************************************************************
 685  *
 686  * Function    :  compile_url_pattern
 687  *
 688  * Description :  Compiles the three parts of an URL pattern.
 689  *
 690  * Parameters  :
 691  *          1  :  url = Target pattern_spec to be filled in.
 692  *          2  :  buf = The url pattern to compile. Will be messed up.
 693  *
 694  * Returns     :  JB_ERR_OK - Success
 695  *                JB_ERR_MEMORY - Out of memory
 696  *                JB_ERR_PARSE - Cannot parse regex
 697  *
 698  *********************************************************************/
 699 static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
 700 {
 701    char *p;
 702
 703    p = strchr(buf, '/');
 704    if (NULL != p)
 705    {
 706       /*
 707        * Only compile the regex if it consists of more than
 708        * a single slash, otherwise it wouldn't affect the result.
 709        */
 710       if (p[1] != '\0')
 711       {
 712          /*
 713           * XXX: does it make sense to compile the slash at the beginning?
 714           */
 715          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
 716
 717          if (JB_ERR_OK != err)
 718          {
 719             return err;
 720          }
 721       }
 722       *p = '\0';
 723    }
 724
 725    /*
 726     * IPv6 numeric hostnames can contain colons, thus we need
 727     * to delimit the hostname before the real port separator.
 728     * As brackets are already used in the hostname pattern,
 729     * we use angle brackets ('<', '>') instead.
 730     */
 731    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 732    {
 733       *p++ = '\0';
 734       buf++;
 735
 736       if (*p == '\0')
 737       {
 738          /* IPv6 address without port number */
 739          p = NULL;
 740       }
 741       else if (*p != ':')
 742       {
 743          /* Garbage after address delimiter */
 744          return JB_ERR_PARSE;
 745       }
 746    }
 747    else
 748    {
 749       p = strchr(buf, ':');
 750    }
 751
 752    if (NULL != p)
 753    {
 754       *p++ = '\0';
 755       url->pattern.url_spec.port_list = strdup_or_die(p);
 756    }
 757    else
 758    {
 759       url->pattern.url_spec.port_list = NULL;
 760    }
 761
 762    if (buf[0] != '\0')
 763    {
 764       return compile_host_pattern(url, buf);
 765    }
 766
 767    return JB_ERR_OK;
 768
 769 }
 770
 771
 772 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 773 /*********************************************************************
 774  *
 775  * Function    :  compile_host_pattern
 776  *
 777  * Description :  Parses and compiles a host pattern.
 778  *
 779  * Parameters  :
 780  *          1  :  url = Target pattern_spec to be filled in.
 781  *          2  :  host_pattern = Host pattern to compile.
 782  *
 783  * Returns     :  JB_ERR_OK - Success
 784  *                JB_ERR_MEMORY - Out of memory
 785  *                JB_ERR_PARSE - Cannot parse regex
 786  *
 787  *********************************************************************/
 788 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 789 {
 790    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->pattern.url_spec.host_regex);
 791 }
 792
 793 #else
 794
 795 /*********************************************************************
 796  *
 797  * Function    :  compile_host_pattern
 798  *
 799  * Description :  Parses and "compiles" an old-school host pattern.
 800  *
 801  * Parameters  :
 802  *          1  :  url = Target pattern_spec to be filled in.
 803  *          2  :  host_pattern = Host pattern to parse.
 804  *
 805  * Returns     :  JB_ERR_OK - Success
 806  *                JB_ERR_PARSE - Cannot parse regex
 807  *
 808  *********************************************************************/
 809 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 810 {
 811    char *v[150];
 812    size_t size;
 813    char *p;
 814
 815    /*
 816     * Parse domain part
 817     */
 818    if (host_pattern[strlen(host_pattern) - 1] == '.')
 819    {
 820       url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
 821    }
 822    if (host_pattern[0] == '.')
 823    {
 824       url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
 825    }
 826
 827    /*
 828     * Split domain into components
 829     */
 830    url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
 831
 832    /*
 833     * Map to lower case
 834     */
 835    for (p = url->pattern.url_spec.dbuffer; *p ; p++)
 836    {
 837       *p = (char)privoxy_tolower(*p);
 838    }
 839
 840    /*
 841     * Split the domain name into components
 842     */
 843    url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
 844
 845    if (url->pattern.url_spec.dcount < 0)
 846    {
 847       free_pattern_spec(url);
 848       return JB_ERR_PARSE;
 849    }
 850    else if (url->pattern.url_spec.dcount != 0)
 851    {
 852       /*
 853        * Save a copy of the pointers in dvec
 854        */
 855       size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
 856
 857       url->pattern.url_spec.dvec = malloc_or_die(size);
 858
 859       memcpy(url->pattern.url_spec.dvec, v, size);
 860    }
 861    /*
 862     * else dcount == 0 in which case we needn't do anything,
 863     * since dvec will never be accessed and the pattern will
 864     * match all domains.
 865     */
 866    return JB_ERR_OK;
 867 }
 868
 869
 870 /*********************************************************************
 871  *
 872  * Function    :  simplematch
 873  *
 874  * Description :  String matching, with a (greedy) '*' wildcard that
 875  *                stands for zero or more arbitrary characters and
 876  *                character classes in [], which take both enumerations
 877  *                and ranges.
 878  *
 879  * Parameters  :
 880  *          1  :  pattern = pattern for matching
 881  *          2  :  text    = text to be matched
 882  *
 883  * Returns     :  0 if match, else nonzero
 884  *
 885  *********************************************************************/
 886 static int simplematch(const char *pattern, const char *text)
 887 {
 888    const unsigned char *pat = (const unsigned char *)pattern;
 889    const unsigned char *txt = (const unsigned char *)text;
 890    const unsigned char *fallback = pat;
 891    int wildcard = 0;
 892
 893    unsigned char lastchar = 'a';
 894    unsigned i;
 895    unsigned char charmap[32];
 896
 897    while (*txt)
 898    {
 899
 900       /* EOF pattern but !EOF text? */
 901       if (*pat == '\0')
 902       {
 903          if (wildcard)
 904          {
 905             pat = fallback;
 906          }
 907          else
 908          {
 909             return 1;
 910          }
 911       }
 912
 913       /* '*' in the pattern?  */
 914       if (*pat == '*')
 915       {
 916
 917          /* The pattern ends afterwards? Speed up the return. */
 918          if (*++pat == '\0')
 919          {
 920             return 0;
 921          }
 922
 923          /* Else, set wildcard mode and remember position after '*' */
 924          wildcard = 1;
 925          fallback = pat;
 926       }
 927
 928       /* Character range specification? */
 929       if (*pat == '[')
 930       {
 931          memset(charmap, '\0', sizeof(charmap));
 932
 933          while (*++pat != ']')
 934          {
 935             if (!*pat)
 936             {
 937                return 1;
 938             }
 939             else if (*pat == '-')
 940             {
 941                if ((*++pat == ']') || *pat == '\0')
 942                {
 943                   return(1);
 944                }
 945                for (i = lastchar; i <= *pat; i++)
 946                {
 947                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 948                }
 949             }
 950             else
 951             {
 952                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 953                lastchar = *pat;
 954             }
 955          }
 956       } /* -END- if Character range specification */
 957
 958
 959       /*
 960        * Char match, or char range match?
 961        */
 962       if ((*pat == *txt)
 963        || (*pat == '?')
 964        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 965       {
 966          /*
 967           * Success: Go ahead
 968           */
 969          pat++;
 970       }
 971       else if (!wildcard)
 972       {
 973          /*
 974           * No match && no wildcard: No luck
 975           */
 976          return 1;
 977       }
 978       else if (pat != fallback)
 979       {
 980          /*
 981           * Increment text pointer if in char range matching
 982           */
 983          if (*pat == ']')
 984          {
 985             txt++;
 986          }
 987          /*
 988           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 989           */
 990          pat = fallback;
 991          /*
 992           * Restart matching from current text pointer
 993           */
 994          continue;
 995       }
 996       txt++;
 997    }
 998
 999    /* Cut off extra '*'s */
1000    if (*pat == '*') pat++;
1001
1002    /* If this is the pattern's end, fine! */
1003    return(*pat);
1004
1005 }
1006
1007
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Compares two equally long vectors of domain name
 *                components pairwise, i.e. the comparison is anchored
 *                on both ends.  Each pair is checked with
 *                simplematch() and the first mismatch ends the
 *                comparison.  Only used by domain_match().
 *
 * Parameters  :
 *          1  :  pv = vector of pattern components
 *          2  :  fv = vector of domain components to check
 *          3  :  len = number of components to compare; both vectors
 *                      must provide at least this many entries.
 *
 * Returns     :  0 => all components match, 1 => at least one
 *                component does not match.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   int idx = 0;

   while (idx < len)
   {
      /* simplematch() reports a mismatch with a nonzero value. */
      if (0 != simplematch(pv[idx], fv[idx]))
      {
         return 1;
      }
      idx++;
   }

   /* No component disagreed (trivially true for len <= 0). */
   return 0;

}
1042
1043
1044 /*********************************************************************
1045  *
1046  * Function    :  domain_match
1047  *
1048  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1049  *                p.pattern->unachored, the comparison is un-, left-,
1050  *                right-anchored, or both.
1051  *                The individual domain names are compared with
1052  *                simplematch().
1053  *
1054  * Parameters  :
1055  *          1  :  p = a domain that may contain a '*' as a wildcard.
1056  *          2  :  fqdn = domain name against which the patterns are compared.
1057  *
1058  * Returns     :  0 => domains are equivalent, else no match.
1059  *
1060  *********************************************************************/
1061 static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
1062 {
1063    char **pv, **fv;  /* vectors  */
1064    int    plen, flen;
1065    int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1066
1067    plen = p->pattern.url_spec.dcount;
1068    flen = fqdn->dcount;
1069
1070    if (flen < plen)
1071    {
1072       /* fqdn is too short to match this pattern */
1073       return 1;
1074    }
1075
1076    pv   = p->pattern.url_spec.dvec;
1077    fv   = fqdn->dvec;
1078
1079    if (unanchored == ANCHOR_LEFT)
1080    {
1081       /*
1082        * Right anchored.
1083        *
1084        * Convert this into a fully anchored pattern with
1085        * the fqdn and pattern the same length
1086        */
1087       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1088       return simple_domaincmp(pv, fv, plen);
1089    }
1090    else if (unanchored == 0)
1091    {
1092       /* Fully anchored, check length */
1093       if (flen != plen)
1094       {
1095          return 1;
1096       }
1097       return simple_domaincmp(pv, fv, plen);
1098    }
1099    else if (unanchored == ANCHOR_RIGHT)
1100    {
1101       /* Left anchored, ignore all extra in fqdn */
1102       return simple_domaincmp(pv, fv, plen);
1103    }
1104    else
1105    {
1106       /* Unanchored */
1107       int n;
1108       int maxn = flen - plen;
1109       for (n = 0; n <= maxn; n++)
1110       {
1111          if (!simple_domaincmp(pv, fv, plen))
1112          {
1113             return 0;
1114          }
1115          /*
1116           * Doesn't match from start of fqdn
1117           * Try skipping first part of fqdn
1118           */
1119          fv++;
1120       }
1121       return 1;
1122    }
1123
1124 }
1125 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1126
1127
1128 /*********************************************************************
1129  *
1130  * Function    :  create_pattern_spec
1131  *
1132  * Description :  Creates a "pattern_spec" structure from a string.
1133  *                When finished, free with free_pattern_spec().
1134  *
1135  * Parameters  :
1136  *          1  :  pattern = Target pattern_spec to be filled in.
1137  *                          Will be zeroed before use.
1138  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1139  *                      contents of this buffer are destroyed by this
1140  *                      function.  If this function succeeds, the
1141  *                      buffer is copied to pattern->spec.  If this
1142  *                      function fails, the contents of the buffer
1143  *                      are lost forever.
1144  *
1145  * Returns     :  JB_ERR_OK - Success
1146  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1147  *                               written to system log)
1148  *
1149  *********************************************************************/
1150 jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
1151 {
1152    static const struct
1153    {
1154       /** The tag pattern prefix to match */
1155       const char *prefix;
1156
1157       /** The length of the prefix to match */
1158       const size_t prefix_length;
1159
1160       /** The pattern flag */
1161       const unsigned flag;
1162    } tag_pattern[] = {
1163       { "TAG:",              4, PATTERN_SPEC_TAG_PATTERN},
1164  #ifdef FEATURE_CLIENT_TAGS
1165       { "CLIENT-TAG:",      11, PATTERN_SPEC_CLIENT_TAG_PATTERN},
1166  #endif
1167       { "NO-REQUEST-TAG:",  15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
1168       { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
1169    };
1170    int i;
1171
1172    assert(pattern);
1173    assert(buf);
1174
1175    memset(pattern, '\0', sizeof(*pattern));
1176
1177    /* Remember the original specification for the CGI pages. */
1178    pattern->spec = strdup_or_die(buf);
1179
1180    /* Check if it's a tag pattern */
1181    for (i = 0; i < SZ(tag_pattern); i++)
1182    {
1183       if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
1184       {
1185          /* The regex starts after the prefix */
1186          const char *tag_regex = buf + tag_pattern[i].prefix_length;
1187
1188          pattern->flags |= tag_pattern[i].flag;
1189
1190          return compile_pattern(tag_regex, NO_ANCHORING, pattern,
1191             &pattern->pattern.tag_regex);
1192       }
1193    }
1194
1195    /* If it isn't a tag pattern it must be an URL pattern. */
1196    pattern->flags |= PATTERN_SPEC_URL_PATTERN;
1197
1198    return compile_url_pattern(pattern, buf);
1199
1200 }
1201
1202
1203 /*********************************************************************
1204  *
1205  * Function    :  free_pattern_spec
1206  *
1207  * Description :  Called from the "unloaders".  Freez the pattern
1208  *                structure elements.
1209  *
1210  * Parameters  :
1211  *          1  :  pattern = pointer to a pattern_spec structure.
1212  *
1213  * Returns     :  N/A
1214  *
1215  *********************************************************************/
1216 void free_pattern_spec(struct pattern_spec *pattern)
1217 {
1218    if (pattern == NULL) return;
1219
1220    freez(pattern->spec);
1221 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1222    if (pattern->pattern.url_spec.host_regex)
1223    {
1224       regfree(pattern->pattern.url_spec.host_regex);
1225       freez(pattern->pattern.url_spec.host_regex);
1226    }
1227 #else
1228    freez(pattern->pattern.url_spec.dbuffer);
1229    freez(pattern->pattern.url_spec.dvec);
1230    pattern->pattern.url_spec.dcount = 0;
1231 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1232    freez(pattern->pattern.url_spec.port_list);
1233    if (pattern->pattern.url_spec.preg)
1234    {
1235       regfree(pattern->pattern.url_spec.preg);
1236       freez(pattern->pattern.url_spec.preg);
1237    }
1238    if (pattern->pattern.tag_regex)
1239    {
1240       regfree(pattern->pattern.tag_regex);
1241       freez(pattern->pattern.tag_regex);
1242    }
1243 }
1244
1245
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks whether a port is acceptable for a port
 *                list.  A missing (NULL) list accepts every port,
 *                otherwise match_portlist() decides.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL.
 *
 * Returns     :  TRUE (1) for yes, FALSE (0) otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (NULL == port_list)
   {
      /* No list means there is no restriction on the port. */
      return 1;
   }

   return (0 != match_portlist(port_list, port));
}
1263
1264
1265 /*********************************************************************
1266  *
1267  * Function    :  host_matches
1268  *
1269  * Description :  Compares a host against a host pattern.
1270  *
1271  * Parameters  :
1272  *          1  :  url = The URL to match
1273  *          2  :  pattern = The URL pattern
1274  *
1275  * Returns     :  TRUE for yes, FALSE otherwise.
1276  *
1277  *********************************************************************/
1278 static int host_matches(const struct http_request *http,
1279                         const struct pattern_spec *pattern)
1280 {
1281    assert(http->host != NULL);
1282 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1283    return ((NULL == pattern->pattern.url_spec.host_regex)
1284       || (0 == regexec(pattern->pattern.url_spec.host_regex, http->host, 0, NULL, 0)));
1285 #else
1286    return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
1287 #endif
1288 }
1289
1290
1291 /*********************************************************************
1292  *
1293  * Function    :  path_matches
1294  *
1295  * Description :  Compares a path against a path pattern.
1296  *
1297  * Parameters  :
1298  *          1  :  path = The path to match
1299  *          2  :  pattern = The URL pattern
1300  *
1301  * Returns     :  TRUE for yes, FALSE otherwise.
1302  *
1303  *********************************************************************/
1304 static int path_matches(const char *path, const struct pattern_spec *pattern)
1305 {
1306    return ((NULL == pattern->pattern.url_spec.preg)
1307       || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
1308 }
1309
1310
1311 /*********************************************************************
1312  *
1313  * Function    :  url_match
1314  *
1315  * Description :  Compare a URL against a URL pattern.
1316  *
1317  * Parameters  :
1318  *          1  :  pattern = a URL pattern
1319  *          2  :  url = URL to match
1320  *
1321  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1322  *
1323  *********************************************************************/
1324 int url_match(const struct pattern_spec *pattern,
1325               const struct http_request *http)
1326 {
1327    if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
1328    {
1329       /* It's not an URL pattern and thus shouldn't be matched against URLs */
1330       return 0;
1331    }
1332
1333    return (port_matches(http->port, pattern->pattern.url_spec.port_list)
1334       && host_matches(http, pattern) && path_matches(http->path, pattern));
1335
1336 }
1337
1338
/*********************************************************************
 *
 * Function    :  portlist_number
 *
 * Description :  Helper for match_portlist().  Converts the decimal
 *                number at the beginning of the text range
 *                [start, end) the way atoi() would convert a string
 *                holding just that range: leading white space and
 *                one optional sign are skipped, conversion stops at
 *                the first character that isn't a digit.
 *                Values that don't fit are clamped by strtol()
 *                instead of causing undefined behaviour.
 *
 * Parameters  :
 *          1  :  start = first character of the range
 *          2  :  end = one past the last character of the range.
 *                      *end has to be readable and must not be a
 *                      digit (match_portlist() passes a pointer to
 *                      a ',', a '-' or the terminating '\0').
 *
 * Returns     :  The converted number, 0 if the range doesn't start
 *                with a number.
 *
 *********************************************************************/
static long portlist_number(const char *start, const char *end)
{
   int negative = 0;
   long magnitude;

   /* Skip the white space atoi() would skip, but never leave the range. */
   while ((start < end) && (NULL != strchr(" \t\n\v\f\r", *start)))
   {
      start++;
   }

   if ((start < end) && ((*start == '+') || (*start == '-')))
   {
      negative = (*start == '-');
      start++;
   }

   if ((start >= end) || (*start < '0') || (*start > '9'))
   {
      /* Empty range or no digits: atoi() would have returned 0. */
      return 0;
   }

   /*
    * start points to a digit, so strtol() consumes digits only and
    * stops at the latest at *end, which is never a digit.
    */
   magnitude = strtol(start, NULL, 10);

   return negative ? -magnitude : magnitude;

}


/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place; it is neither
 *                modified nor copied.  For every item the text
 *                before the first '-' is the lower bound and the
 *                text behind it the upper bound.  An omitted lower
 *                bound counts as 0, an omitted (or zero) upper bound
 *                as 65535.  Items without '-' have to equal the
 *                port.  Text that isn't a number is treated as 0,
 *                therefore an empty item matches port 0.
 *
 * Parameters  :
 *          1  :  portlist = String with list, must not be NULL
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      const char *comma = strchr(item, (int) ',');
      const char *item_end = (NULL != comma) ? comma : item + strlen(item);
      const char *dash = memchr(item, (int) '-', (size_t)(item_end - item));

      if (NULL == dash)
      {
         /*
          * No dash, check for equality
          */
         if ((long)port == portlist_number(item, item_end))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         const long min = portlist_number(item, dash);
         long max = portlist_number(dash + 1, item_end);

         if (0 == max)
         {
            max = 65535;
         }

         if (((long)port >= min) && ((long)port <= max))
         {
            return 1;
         }
      }

      /*
       * Jump to the item behind the comma, if there is one
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1417
1418
1419 /*********************************************************************
1420  *
1421  * Function    :  parse_forwarder_address
1422  *
1423  * Description :  Parse out the host and port from a forwarder address.
1424  *
1425  * Parameters  :
1426  *          1  :  address = The forwarder address to parse.
1427  *          2  :  hostname = Used to return the hostname. NULL on error.
1428  *          3  :  port = Used to return the port. Untouched if no port
1429  *                       is specified.
1430  *
1431  * Returns     :  JB_ERR_OK on success
1432  *                JB_ERR_MEMORY on out of memory
1433  *                JB_ERR_PARSE on malformed address.
1434  *
1435  *********************************************************************/
1436 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1437 {
1438    char *p = address;
1439
1440    if ((*address == '[') && (NULL == strchr(address, ']')))
1441    {
1442       /* XXX: Should do some more validity checks here. */
1443       return JB_ERR_PARSE;
1444    }
1445
1446    *hostname = strdup_or_die(address);
1447
1448    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1449    {
1450       *p++ = '\0';
1451       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1452       if (*p == ':')
1453       {
1454          *port = (int)strtol(++p, NULL, 0);
1455       }
1456    }
1457    else if (NULL != (p = strchr(*hostname, ':')))
1458    {
1459       *p++ = '\0';
1460       *port = (int)strtol(p, NULL, 0);
1461    }
1462
1463    return JB_ERR_OK;
1464
1465 }
1466
1467
1468 /*
1469   Local Variables:
1470   tab-width: 3
1471   end:
1472 */