urlmatch.c

   1 /*********************************************************************
   2  *
   3  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   4  *
   5  * Purpose     :  Declares functions to match URLs against URL
   6  *                patterns.
   7  *
   8  * Copyright   :  Written by and Copyright (C) 2001-2020
   9  *                the Privoxy team. https://www.privoxy.org/
  10  *
  11  *                Based on the Internet Junkbuster originally written
  12  *                by and Copyright (C) 1997 Anonymous Coders and
  13  *                Junkbusters Corporation.  http://www.junkbusters.com
  14  *
  15  *                This program is free software; you can redistribute it
  16  *                and/or modify it under the terms of the GNU General
  17  *                Public License as published by the Free Software
  18  *                Foundation; either version 2 of the License, or (at
  19  *                your option) any later version.
  20  *
  21  *                This program is distributed in the hope that it will
  22  *                be useful, but WITHOUT ANY WARRANTY; without even the
  23  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  24  *                PARTICULAR PURPOSE.  See the GNU General Public
  25  *                License for more details.
  26  *
  27  *                The GNU General Public License should be included with
  28  *                this file.  If not, you can view it at
  29  *                http://www.gnu.org/copyleft/gpl.html
  30  *                or write to the Free Software Foundation, Inc., 59
  31  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  32  *
  33  *********************************************************************/
  34
  35
  36 #include "config.h"
  37
  38 #ifndef _WIN32
  39 #include <stdio.h>
  40 #include <sys/types.h>
  41 #endif
  42
  43 #include <stdlib.h>
  44 #include <ctype.h>
  45 #include <assert.h>
  46 #include <string.h>
  47
  48 #if !defined(_WIN32)
  49 #include <unistd.h>
  50 #endif
  51
  52 #include "project.h"
  53 #include "urlmatch.h"
  54 #include "ssplit.h"
  55 #include "miscutil.h"
  56 #include "errlog.h"
  57
/*
 * Selects how compile_pattern() decorates a pattern before it is
 * handed to regcomp().
 */
enum regex_anchoring
{
   NO_ANCHORING,         /* pattern is compiled unchanged */
   LEFT_ANCHORED,        /* "^" is put in front of the pattern */
   RIGHT_ANCHORED,       /* "$" is appended to the pattern */
   RIGHT_ANCHORED_HOST   /* an optional literal dot followed by "$" is appended */
};

/*
 * Forward declarations: both host pattern compilers are called by
 * compile_url_pattern() before their definitions appear in this file.
 */
static jb_err compile_vanilla_host_pattern(struct pattern_spec *url, const char *host_pattern);
#ifdef FEATURE_PCRE_HOST_PATTERNS
static jb_err compile_pcre_host_pattern(struct pattern_spec *url, const char *host_pattern);
#endif
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->version);
  94    freez(http->host_ip_addr_str);
  95    freez(http->dbuffer);
  96    freez(http->dvec);
  97    http->dcount = 0;
  98 }
  99
 100
 101 /*********************************************************************
 102  *
 103  * Function    :  init_domain_components
 104  *
 105  * Description :  Splits the domain name so we can compare it
 106  *                against wildcards. It used to be part of
 107  *                parse_http_url, but was separated because the
 108  *                same code is required in chat in case of
 109  *                intercepted requests.
 110  *
 111  * Parameters  :
 112  *          1  :  http = pointer to the http structure to hold elements.
 113  *
 114  * Returns     :  JB_ERR_OK on success
 115  *                JB_ERR_PARSE on malformed command/URL
 116  *                             or >100 domains deep.
 117  *
 118  *********************************************************************/
 119 jb_err init_domain_components(struct http_request *http)
 120 {
 121    char *vec[BUFFER_SIZE];
 122    size_t size;
 123    char *p;
 124
 125    http->dbuffer = strdup_or_die(http->host);
 126
 127    /* map to lower case */
 128    for (p = http->dbuffer; *p ; p++)
 129    {
 130       *p = (char)privoxy_tolower(*p);
 131    }
 132
 133    /* split the domain name into components */
 134    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 135
 136    if (http->dcount <= 0)
 137    {
 138       /*
 139        * Error: More than SZ(vec) components in domain
 140        *    or: no components in domain
 141        */
 142       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 143       return JB_ERR_PARSE;
 144    }
 145
 146    /* save a copy of the pointers in dvec */
 147    size = (size_t)http->dcount * sizeof(*http->dvec);
 148
 149    http->dvec = malloc_or_die(size);
 150
 151    memcpy(http->dvec, vec, size);
 152
 153    return JB_ERR_OK;
 154 }
 155
 156
 157 /*********************************************************************
 158  *
 159  * Function    :  url_requires_percent_encoding
 160  *
 161  * Description :  Checks if an URL contains invalid characters
 162  *                according to RFC 3986 that should be percent-encoded.
 163  *                Does not verify whether or not the passed string
 164  *                actually is a valid URL.
 165  *
 166  * Parameters  :
 167  *          1  :  url = URL to check
 168  *
 169  * Returns     :  True in case of valid URLs, false otherwise
 170  *
 171  *********************************************************************/
 172 int url_requires_percent_encoding(const char *url)
 173 {
 174    static const char allowed_characters[128] = {
 175       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 176       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 177       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 178       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 179       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 180       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 181       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 182       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 183       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 184       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 185       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 186       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 187       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 188    };
 189
 190    while (*url != '\0')
 191    {
 192       const unsigned int i = (unsigned char)*url++;
 193       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 194       {
 195          return TRUE;
 196       }
 197    }
 198
 199    return FALSE;
 200
 201 }
 202
 203
 204 /*********************************************************************
 205  *
 206  * Function    :  parse_http_url
 207  *
 208  * Description :  Parse out the host and port from the URL.  Find the
 209  *                hostname & path, port (if ':'), and/or password (if '@')
 210  *
 211  * Parameters  :
 212  *          1  :  url = URL (or is it URI?) to break down
 213  *          2  :  http = pointer to the http structure to hold elements.
 214  *                       Must be initialized with valid values (like NULLs).
 215  *          3  :  require_protocol = Whether or not URLs without
 216  *                                   protocol are acceptable.
 217  *
 218  * Returns     :  JB_ERR_OK on success
 219  *                JB_ERR_PARSE on malformed command/URL
 220  *                             or >100 domains deep.
 221  *
 222  *********************************************************************/
 223 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 224 {
 225    int host_available = 1; /* A proxy can dream. */
 226
 227    /*
 228     * Save our initial URL
 229     */
 230    http->url = strdup_or_die(url);
 231
 232    /*
 233     * Check for * URI. If found, we're done.
 234     */
 235    if (*http->url == '*')
 236    {
 237       http->path = strdup_or_die("*");
 238       http->hostport = strdup_or_die("");
 239       if (http->url[1] != '\0')
 240       {
 241          return JB_ERR_PARSE;
 242       }
 243       return JB_ERR_OK;
 244    }
 245
 246
 247    /*
 248     * Split URL into protocol,hostport,path.
 249     */
 250    {
 251       char *buf;
 252       char *url_noproto;
 253       char *url_path;
 254
 255       buf = strdup_or_die(url);
 256
 257       /* Find the start of the URL in our scratch space */
 258       url_noproto = buf;
 259       if (strncmpic(url_noproto, "http://",  7) == 0)
 260       {
 261          url_noproto += 7;
 262       }
 263       else if (strncmpic(url_noproto, "https://", 8) == 0)
 264       {
 265          /*
 266           * Should only happen when called from cgi_show_url_info().
 267           */
 268          url_noproto += 8;
 269          http->ssl = 1;
 270       }
 271       else if (*url_noproto == '/')
 272       {
 273         /*
 274          * Short request line without protocol and host.
 275          * Most likely because the client's request
 276          * was intercepted and redirected into Privoxy.
 277          */
 278          http->host = NULL;
 279          host_available = 0;
 280       }
 281       else if (require_protocol)
 282       {
 283          freez(buf);
 284          return JB_ERR_PARSE;
 285       }
 286
 287       url_path = strchr(url_noproto, '/');
 288       if (url_path != NULL)
 289       {
 290          /*
 291           * Got a path.
 292           *
 293           * If FEATURE_HTTPS_INSPECTION isn't available, ignore the
 294           * path for https URLs so that we get consistent behaviour
 295           * if a https URL is parsed. When the URL is actually
 296           * retrieved, https hides the path part.
 297           */
 298          http->path = strdup_or_die(
 299 #ifndef FEATURE_HTTPS_INSPECTION
 300             http->ssl ? "/" :
 301 #endif
 302             url_path
 303          );
 304          *url_path = '\0';
 305          http->hostport = string_tolower(url_noproto);
 306       }
 307       else
 308       {
 309          /*
 310           * Repair broken HTTP requests that don't contain a path,
 311           * or CONNECT requests
 312           */
 313          http->path = strdup_or_die("/");
 314          http->hostport = string_tolower(url_noproto);
 315       }
 316
 317       freez(buf);
 318
 319       if (http->hostport == NULL)
 320       {
 321          return JB_ERR_PARSE;
 322       }
 323    }
 324
 325    if (!host_available)
 326    {
 327       /* Without host, there is nothing left to do here */
 328       return JB_ERR_OK;
 329    }
 330
 331    /*
 332     * Split hostport into user/password (ignored), host, port.
 333     */
 334    {
 335       char *buf;
 336       char *host;
 337       char *port;
 338
 339       buf = strdup_or_die(http->hostport);
 340
 341       /* check if url contains username and/or password */
 342       host = strchr(buf, '@');
 343       if (host != NULL)
 344       {
 345          /* Contains username/password, skip it and the @ sign. */
 346          host++;
 347       }
 348       else
 349       {
 350          /* No username or password. */
 351          host = buf;
 352       }
 353
 354       /* Move after hostname before port number */
 355       if (*host == '[')
 356       {
 357          /* Numeric IPv6 address delimited by brackets */
 358          host++;
 359          port = strchr(host, ']');
 360
 361          if (port == NULL)
 362          {
 363             /* Missing closing bracket */
 364             freez(buf);
 365             return JB_ERR_PARSE;
 366          }
 367
 368          *port++ = '\0';
 369
 370          if (*port == '\0')
 371          {
 372             port = NULL;
 373          }
 374          else if (*port != ':')
 375          {
 376             /* Garbage after closing bracket */
 377             freez(buf);
 378             return JB_ERR_PARSE;
 379          }
 380       }
 381       else
 382       {
 383          /* Plain non-escaped hostname */
 384          port = strchr(host, ':');
 385       }
 386
 387       /* check if url contains port */
 388       if (port != NULL)
 389       {
 390          /* Contains port */
 391          char *endptr;
 392          long parsed_port;
 393          /* Terminate hostname and point to start of port string */
 394          *port++ = '\0';
 395          parsed_port = strtol(port, &endptr, 10);
 396          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 397          {
 398             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 399             freez(buf);
 400             return JB_ERR_PARSE;
 401          }
 402          http->port = (int)parsed_port;
 403       }
 404       else
 405       {
 406          /* No port specified. */
 407          http->port = (http->ssl ? 443 : 80);
 408       }
 409
 410       http->host = strdup_or_die(host);
 411
 412       freez(buf);
 413    }
 414
 415    /* Split domain name so we can compare it against wildcards */
 416    return init_domain_components(http);
 417
 418 }
 419
 420
 421 /*********************************************************************
 422  *
 423  * Function    :  unknown_method
 424  *
 425  * Description :  Checks whether a method is unknown.
 426  *
 427  * Parameters  :
 428  *          1  :  method = points to a http method
 429  *
 430  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 431  *
 432  *********************************************************************/
 433 static int unknown_method(const char *method)
 434 {
 435    static const char * const known_http_methods[] = {
 436       /* Basic HTTP request type */
 437       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 438       /* webDAV extensions (RFC2518) */
 439       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 440       /*
 441        * Microsoft webDAV extension for Exchange 2000.  See:
 442        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 443        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 444        */
 445       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 446       /*
 447        * Another Microsoft webDAV extension for Exchange 2000.  See:
 448        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 449        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 450        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 451        */
 452       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 453       /*
 454        * Yet another WebDAV extension, this time for
 455        * Web Distributed Authoring and Versioning (RFC3253)
 456        */
 457       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 458       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 459       /*
 460        * The PATCH method is defined by RFC5789, the format of the
 461        * actual patch in the body depends on the application, but from
 462        * Privoxy's point of view it doesn't matter.
 463        */
 464       "PATCH",
 465    };
 466    int i;
 467
 468    for (i = 0; i < SZ(known_http_methods); i++)
 469    {
 470       if (0 == strcmpic(method, known_http_methods[i]))
 471       {
 472          return FALSE;
 473       }
 474    }
 475
 476    return TRUE;
 477
 478 }
 479
 480
 481 /*********************************************************************
 482  *
 483  * Function    :  normalize_http_version
 484  *
 485  * Description :  Take a supported HTTP version string and remove
 486  *                leading zeroes etc., reject unsupported versions.
 487  *
 488  *                This is an explicit RFC 2616 (3.1) MUST and
 489  *                RFC 7230 mandates that intermediaries send their
 490  *                own HTTP-version in forwarded messages.
 491  *
 492  * Parameters  :
 493  *          1  :  http_version = HTTP version string
 494  *
 495  * Returns     :  JB_ERR_OK on success
 496  *                JB_ERR_PARSE if the HTTP version is unsupported
 497  *
 498  *********************************************************************/
 499 static jb_err normalize_http_version(char *http_version)
 500 {
 501    unsigned int major_version;
 502    unsigned int minor_version;
 503
 504    if (2 != sscanf(http_version, "HTTP/%u.%u", &major_version, &minor_version))
 505    {
 506       log_error(LOG_LEVEL_ERROR, "Unsupported HTTP version: %s", http_version);
 507       return JB_ERR_PARSE;
 508    }
 509
 510    if (major_version != 1 || (minor_version != 0 && minor_version != 1))
 511    {
 512       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 513          "versions are 1.0 and 1.1. This rules out: %s", http_version);
 514       return JB_ERR_PARSE;
 515    }
 516
 517    assert(strlen(http_version) >= 8);
 518    snprintf(http_version, 9, "HTTP/%u.%u", major_version, minor_version);
 519
 520    return JB_ERR_OK;
 521
 522 }
 523
 524
 525 /*********************************************************************
 526  *
 527  * Function    :  parse_http_request
 528  *
 529  * Description :  Parse out the host and port from the URL.  Find the
 530  *                hostname & path, port (if ':'), and/or password (if '@')
 531  *
 532  * Parameters  :
 533  *          1  :  req = HTTP request line to break down
 534  *          2  :  http = pointer to the http structure to hold elements
 535  *
 536  * Returns     :  JB_ERR_OK on success
 537  *                JB_ERR_CGI_PARAMS on malformed command/URL
 538  *                                  or >100 domains deep.
 539  *
 540  *********************************************************************/
 541 jb_err parse_http_request(const char *req, struct http_request *http)
 542 {
 543    char *buf;
 544    char *v[3];
 545    int n;
 546    jb_err err;
 547
 548    memset(http, '\0', sizeof(*http));
 549
 550    buf = strdup_or_die(req);
 551
 552    n = ssplit(buf, " \r\n", v, SZ(v));
 553    if (n != 3)
 554    {
 555       freez(buf);
 556       return JB_ERR_PARSE;
 557    }
 558
 559    /*
 560     * Fail in case of unknown methods
 561     * which we might not handle correctly.
 562     *
 563     * XXX: There should be a config option
 564     * to forward requests with unknown methods
 565     * anyway. Most of them don't need special
 566     * steps.
 567     */
 568    if (unknown_method(v[0]))
 569    {
 570       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 571       freez(buf);
 572       return JB_ERR_PARSE;
 573    }
 574
 575    if (JB_ERR_OK != normalize_http_version(v[2]))
 576    {
 577       freez(buf);
 578       return JB_ERR_PARSE;
 579    }
 580
 581    http->ssl = !strcmpic(v[0], "CONNECT");
 582
 583    err = parse_http_url(v[1], http, !http->ssl);
 584    if (err)
 585    {
 586       freez(buf);
 587       return err;
 588    }
 589
 590    /*
 591     * Copy the details into the structure
 592     */
 593    http->cmd = strdup_or_die(req);
 594    http->gpc = strdup_or_die(v[0]);
 595    http->version = strdup_or_die(v[2]);
 596    http->ocmd = strdup_or_die(http->cmd);
 597
 598    freez(buf);
 599
 600    return JB_ERR_OK;
 601
 602 }
 603
 604
 605 /*********************************************************************
 606  *
 607  * Function    :  compile_pattern
 608  *
 609  * Description :  Compiles a host, domain or TAG pattern.
 610  *
 611  * Parameters  :
 612  *          1  :  pattern = The pattern to compile.
 613  *          2  :  anchoring = How the regex should be modified
 614  *                            before compilation. Can be either
 615  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 616  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 617  *          3  :  url     = In case of failures, the spec member is
 618  *                          logged and the structure freed.
 619  *          4  :  regex   = Where the compiled regex should be stored.
 620  *
 621  * Returns     :  JB_ERR_OK - Success
 622  *                JB_ERR_PARSE - Cannot parse regex
 623  *
 624  *********************************************************************/
 625 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 626                               struct pattern_spec *url, regex_t **regex)
 627 {
 628    int errcode;
 629    const char *fmt = NULL;
 630    char *rebuf;
 631    size_t rebuf_size;
 632
 633    assert(pattern);
 634
 635    if (pattern[0] == '\0')
 636    {
 637       *regex = NULL;
 638       return JB_ERR_OK;
 639    }
 640
 641    switch (anchoring)
 642    {
 643       case NO_ANCHORING:
 644          fmt = "%s";
 645          break;
 646       case RIGHT_ANCHORED:
 647          fmt = "%s$";
 648          break;
 649       case RIGHT_ANCHORED_HOST:
 650          fmt = "%s\\.?$";
 651          break;
 652       case LEFT_ANCHORED:
 653          fmt = "^%s";
 654          break;
 655       default:
 656          log_error(LOG_LEVEL_FATAL,
 657             "Invalid anchoring in compile_pattern %d", anchoring);
 658    }
 659    rebuf_size = strlen(pattern) + strlen(fmt);
 660    rebuf = malloc_or_die(rebuf_size);
 661    *regex = zalloc_or_die(sizeof(**regex));
 662
 663    snprintf(rebuf, rebuf_size, fmt, pattern);
 664
 665    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 666
 667    if (errcode)
 668    {
 669       size_t errlen = regerror(errcode, *regex, rebuf, rebuf_size);
 670       if (errlen > (rebuf_size - (size_t)1))
 671       {
 672          errlen = rebuf_size - (size_t)1;
 673       }
 674       rebuf[errlen] = '\0';
 675       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 676          pattern, url->spec, rebuf);
 677       free_pattern_spec(url);
 678       freez(rebuf);
 679
 680       return JB_ERR_PARSE;
 681    }
 682    freez(rebuf);
 683
 684    return JB_ERR_OK;
 685
 686 }
 687
 688
 689 /*********************************************************************
 690  *
 691  * Function    :  compile_url_pattern
 692  *
 693  * Description :  Compiles the three parts of an URL pattern.
 694  *
 695  * Parameters  :
 696  *          1  :  url = Target pattern_spec to be filled in.
 697  *          2  :  buf = The url pattern to compile. Will be messed up.
 698  *
 699  * Returns     :  JB_ERR_OK - Success
 700  *                JB_ERR_MEMORY - Out of memory
 701  *                JB_ERR_PARSE - Cannot parse regex
 702  *
 703  *********************************************************************/
 704 static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
 705 {
 706    char *p;
 707
 708 #ifdef FEATURE_PCRE_HOST_PATTERNS
 709    const size_t prefix_length = 18;
 710    if (strncmpic(buf, "PCRE-HOST-PATTERN:", prefix_length) == 0)
 711    {
 712       url->pattern.url_spec.host_regex_type = PCRE_HOST_PATTERN;
 713       /* Overwrite the "PCRE-HOST-PATTERN:" prefix */
 714       memmove(buf, buf+prefix_length, strlen(buf+prefix_length)+1);
 715    }
 716    else
 717    {
 718       url->pattern.url_spec.host_regex_type = VANILLA_HOST_PATTERN;
 719    }
 720 #endif
 721
 722    p = strchr(buf, '/');
 723    if (NULL != p)
 724    {
 725       /*
 726        * Only compile the regex if it consists of more than
 727        * a single slash, otherwise it wouldn't affect the result.
 728        */
 729       if (p[1] != '\0')
 730       {
 731          /*
 732           * XXX: does it make sense to compile the slash at the beginning?
 733           */
 734          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
 735
 736          if (JB_ERR_OK != err)
 737          {
 738             return err;
 739          }
 740       }
 741       *p = '\0';
 742    }
 743
 744    /*
 745     * IPv6 numeric hostnames can contain colons, thus we need
 746     * to delimit the hostname before the real port separator.
 747     * As brackets are already used in the hostname pattern,
 748     * we use angle brackets ('<', '>') instead.
 749     */
 750    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 751    {
 752       *p++ = '\0';
 753       buf++;
 754
 755       if (*p == '\0')
 756       {
 757          /* IPv6 address without port number */
 758          p = NULL;
 759       }
 760       else if (*p != ':')
 761       {
 762          /* Garbage after address delimiter */
 763          return JB_ERR_PARSE;
 764       }
 765    }
 766    else
 767    {
 768       p = strchr(buf, ':');
 769    }
 770
 771    if (NULL != p)
 772    {
 773       *p++ = '\0';
 774       url->pattern.url_spec.port_list = strdup_or_die(p);
 775    }
 776    else
 777    {
 778       url->pattern.url_spec.port_list = NULL;
 779    }
 780
 781    if (buf[0] != '\0')
 782    {
 783 #ifdef FEATURE_PCRE_HOST_PATTERNS
 784       if (url->pattern.url_spec.host_regex_type == PCRE_HOST_PATTERN)
 785       {
 786          return compile_pcre_host_pattern(url, buf);
 787       }
 788       else
 789 #endif
 790       {
 791          return compile_vanilla_host_pattern(url, buf);
 792       }
 793    }
 794
 795    return JB_ERR_OK;
 796
 797 }
 798
 799
 800 #ifdef FEATURE_PCRE_HOST_PATTERNS
 801 /*********************************************************************
 802  *
 803  * Function    :  compile_pcre_host_pattern
 804  *
 805  * Description :  Parses and compiles a pcre host pattern.
 806  *
 807  * Parameters  :
 808  *          1  :  url = Target pattern_spec to be filled in.
 809  *          2  :  host_pattern = Host pattern to compile.
 810  *
 811  * Returns     :  JB_ERR_OK - Success
 812  *                JB_ERR_MEMORY - Out of memory
 813  *                JB_ERR_PARSE - Cannot parse regex
 814  *
 815  *********************************************************************/
 816 static jb_err compile_pcre_host_pattern(struct pattern_spec *url, const char *host_pattern)
 817 {
 818    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->pattern.url_spec.host_regex);
 819 }
 820 #endif /* def FEATURE_PCRE_HOST_PATTERNS */
 821
 822
 823 /*********************************************************************
 824  *
 825  * Function    :  compile_vanilla_host_pattern
 826  *
 827  * Description :  Parses and "compiles" an old-school host pattern.
 828  *
 829  * Parameters  :
 830  *          1  :  url = Target pattern_spec to be filled in.
 831  *          2  :  host_pattern = Host pattern to parse.
 832  *
 833  * Returns     :  JB_ERR_OK - Success
 834  *                JB_ERR_PARSE - Cannot parse regex
 835  *
 836  *********************************************************************/
 837 static jb_err compile_vanilla_host_pattern(struct pattern_spec *url, const char *host_pattern)
 838 {
 839    char *v[150];
 840    size_t size;
 841    char *p;
 842
 843    /*
 844     * Parse domain part
 845     */
 846    if (host_pattern[strlen(host_pattern) - 1] == '.')
 847    {
 848       url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
 849    }
 850    if (host_pattern[0] == '.')
 851    {
 852       url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
 853    }
 854
 855    /*
 856     * Split domain into components
 857     */
 858    url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
 859
 860    /*
 861     * Map to lower case
 862     */
 863    for (p = url->pattern.url_spec.dbuffer; *p ; p++)
 864    {
 865       *p = (char)privoxy_tolower(*p);
 866    }
 867
 868    /*
 869     * Split the domain name into components
 870     */
 871    url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
 872
 873    if (url->pattern.url_spec.dcount < 0)
 874    {
 875       free_pattern_spec(url);
 876       return JB_ERR_PARSE;
 877    }
 878    else if (url->pattern.url_spec.dcount != 0)
 879    {
 880       /*
 881        * Save a copy of the pointers in dvec
 882        */
 883       size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
 884
 885       url->pattern.url_spec.dvec = malloc_or_die(size);
 886
 887       memcpy(url->pattern.url_spec.dvec, v, size);
 888    }
 889    /*
 890     * else dcount == 0 in which case we needn't do anything,
 891     * since dvec will never be accessed and the pattern will
 892     * match all domains.
 893     */
 894    return JB_ERR_OK;
 895 }
 896
 897
 898 /*********************************************************************
 899  *
 900  * Function    :  simplematch
 901  *
 902  * Description :  String matching, with a (greedy) '*' wildcard that
 903  *                stands for zero or more arbitrary characters and
 904  *                character classes in [], which take both enumerations
 905  *                and ranges.
 906  *
 907  * Parameters  :
 908  *          1  :  pattern = pattern for matching
 909  *          2  :  text    = text to be matched
 910  *
 911  * Returns     :  0 if match, else nonzero
 912  *
 913  *********************************************************************/
 914 static int simplematch(const char *pattern, const char *text)
 915 {
 916    const unsigned char *pat = (const unsigned char *)pattern;
 917    const unsigned char *txt = (const unsigned char *)text;
 918    const unsigned char *fallback = pat;
 919    int wildcard = 0;
 920
 921    unsigned char lastchar = 'a';
 922    unsigned i;
 923    unsigned char charmap[32];
 924
 925    while (*txt)
 926    {
 927
 928       /* EOF pattern but !EOF text? */
 929       if (*pat == '\0')
 930       {
 931          if (wildcard)
 932          {
 933             pat = fallback;
 934          }
 935          else
 936          {
 937             return 1;
 938          }
 939       }
 940
 941       /* '*' in the pattern?  */
 942       if (*pat == '*')
 943       {
 944
 945          /* The pattern ends afterwards? Speed up the return. */
 946          if (*++pat == '\0')
 947          {
 948             return 0;
 949          }
 950
 951          /* Else, set wildcard mode and remember position after '*' */
 952          wildcard = 1;
 953          fallback = pat;
 954       }
 955
 956       /* Character range specification? */
 957       if (*pat == '[')
 958       {
 959          memset(charmap, '\0', sizeof(charmap));
 960
 961          while (*++pat != ']')
 962          {
 963             if (!*pat)
 964             {
 965                return 1;
 966             }
 967             else if (*pat == '-')
 968             {
 969                if ((*++pat == ']') || *pat == '\0')
 970                {
 971                   return(1);
 972                }
 973                for (i = lastchar; i <= *pat; i++)
 974                {
 975                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 976                }
 977             }
 978             else
 979             {
 980                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 981                lastchar = *pat;
 982             }
 983          }
 984       } /* -END- if Character range specification */
 985
 986
 987       /*
 988        * Char match, or char range match?
 989        */
 990       if ((*pat == *txt)
 991        || (*pat == '?')
 992        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 993       {
 994          /*
 995           * Success: Go ahead
 996           */
 997          pat++;
 998       }
 999       else if (!wildcard)
1000       {
1001          /*
1002           * No match && no wildcard: No luck
1003           */
1004          return 1;
1005       }
1006       else if (pat != fallback)
1007       {
1008          /*
1009           * Increment text pointer if in char range matching
1010           */
1011          if (*pat == ']')
1012          {
1013             txt++;
1014          }
1015          /*
1016           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
1017           */
1018          pat = fallback;
1019          /*
1020           * Restart matching from current text pointer
1021           */
1022          continue;
1023       }
1024       txt++;
1025    }
1026
1027    /* Cut off extra '*'s */
1028    if (*pat == '*') pat++;
1029
1030    /* If this is the pattern's end, fine! */
1031    return(*pat);
1032
1033 }
1034
1035
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Compares two vectors of domain name components
 *                position by position using simplematch(). Every
 *                position has to match, so the comparison is anchored
 *                on both sides. Only used by domain_match().
 *
 * Parameters  :
 *          1  :  pv = vector of pattern components
 *          2  :  fv = vector of domain components to check
 *          3  :  len = number of components to compare; both vectors
 *                      must provide at least this many entries.
 *
 * Returns     :  0 => every component matched its pattern,
 *                1 => at least one component did not match.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   char **pattern_part = pv;
   char **domain_part = fv;
   int remaining;

   for (remaining = len; remaining > 0; remaining--)
   {
      /* simplematch() reports a mismatch with a nonzero result. */
      if (0 != simplematch(*pattern_part, *domain_part))
      {
         return 1;
      }
      pattern_part++;
      domain_part++;
   }

   return 0;

}
1070
1071
1072 /*********************************************************************
1073  *
1074  * Function    :  domain_match
1075  *
1076  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1077  *                p.pattern->unachored, the comparison is un-, left-,
1078  *                right-anchored, or both.
1079  *                The individual domain names are compared with
1080  *                simplematch().
1081  *
1082  * Parameters  :
1083  *          1  :  p = a domain that may contain a '*' as a wildcard.
1084  *          2  :  fqdn = domain name against which the patterns are compared.
1085  *
1086  * Returns     :  0 => domains are equivalent, else no match.
1087  *
1088  *********************************************************************/
1089 static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
1090 {
1091    char **pv, **fv;  /* vectors  */
1092    int    plen, flen;
1093    int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1094
1095    plen = p->pattern.url_spec.dcount;
1096    flen = fqdn->dcount;
1097
1098    if (flen < plen)
1099    {
1100       /* fqdn is too short to match this pattern */
1101       return 1;
1102    }
1103
1104    pv   = p->pattern.url_spec.dvec;
1105    fv   = fqdn->dvec;
1106
1107    if (unanchored == ANCHOR_LEFT)
1108    {
1109       /*
1110        * Right anchored.
1111        *
1112        * Convert this into a fully anchored pattern with
1113        * the fqdn and pattern the same length
1114        */
1115       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1116       return simple_domaincmp(pv, fv, plen);
1117    }
1118    else if (unanchored == 0)
1119    {
1120       /* Fully anchored, check length */
1121       if (flen != plen)
1122       {
1123          return 1;
1124       }
1125       return simple_domaincmp(pv, fv, plen);
1126    }
1127    else if (unanchored == ANCHOR_RIGHT)
1128    {
1129       /* Left anchored, ignore all extra in fqdn */
1130       return simple_domaincmp(pv, fv, plen);
1131    }
1132    else
1133    {
1134       /* Unanchored */
1135       int n;
1136       int maxn = flen - plen;
1137       for (n = 0; n <= maxn; n++)
1138       {
1139          if (!simple_domaincmp(pv, fv, plen))
1140          {
1141             return 0;
1142          }
1143          /*
1144           * Doesn't match from start of fqdn
1145           * Try skipping first part of fqdn
1146           */
1147          fv++;
1148       }
1149       return 1;
1150    }
1151
1152 }
1153
1154
1155 /*********************************************************************
1156  *
1157  * Function    :  create_pattern_spec
1158  *
1159  * Description :  Creates a "pattern_spec" structure from a string.
1160  *                When finished, free with free_pattern_spec().
1161  *
1162  * Parameters  :
1163  *          1  :  pattern = Target pattern_spec to be filled in.
1164  *                          Will be zeroed before use.
1165  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1166  *                      contents of this buffer are destroyed by this
1167  *                      function.  If this function succeeds, the
1168  *                      buffer is copied to pattern->spec.  If this
1169  *                      function fails, the contents of the buffer
1170  *                      are lost forever.
1171  *
1172  * Returns     :  JB_ERR_OK - Success
1173  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1174  *                               written to system log)
1175  *
1176  *********************************************************************/
1177 jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
1178 {
1179    static const struct
1180    {
1181       /** The tag pattern prefix to match */
1182       const char *prefix;
1183
1184       /** The length of the prefix to match */
1185       const size_t prefix_length;
1186
1187       /** The pattern flag */
1188       const unsigned flag;
1189    } tag_pattern[] = {
1190       { "TAG:",              4, PATTERN_SPEC_TAG_PATTERN},
1191  #ifdef FEATURE_CLIENT_TAGS
1192       { "CLIENT-TAG:",      11, PATTERN_SPEC_CLIENT_TAG_PATTERN},
1193  #endif
1194       { "NO-REQUEST-TAG:",  15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
1195       { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
1196    };
1197    int i;
1198
1199    assert(pattern);
1200    assert(buf);
1201
1202    memset(pattern, '\0', sizeof(*pattern));
1203
1204    /* Remember the original specification for the CGI pages. */
1205    pattern->spec = strdup_or_die(buf);
1206
1207    /* Check if it's a tag pattern */
1208    for (i = 0; i < SZ(tag_pattern); i++)
1209    {
1210       if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
1211       {
1212          /* The regex starts after the prefix */
1213          const char *tag_regex = buf + tag_pattern[i].prefix_length;
1214
1215          pattern->flags |= tag_pattern[i].flag;
1216
1217          return compile_pattern(tag_regex, NO_ANCHORING, pattern,
1218             &pattern->pattern.tag_regex);
1219       }
1220    }
1221
1222    /* If it isn't a tag pattern it must be an URL pattern. */
1223    pattern->flags |= PATTERN_SPEC_URL_PATTERN;
1224
1225    return compile_url_pattern(pattern, buf);
1226
1227 }
1228
1229
1230 /*********************************************************************
1231  *
1232  * Function    :  free_pattern_spec
1233  *
1234  * Description :  Called from the "unloaders".  Freez the pattern
1235  *                structure elements.
1236  *
1237  * Parameters  :
1238  *          1  :  pattern = pointer to a pattern_spec structure.
1239  *
1240  * Returns     :  N/A
1241  *
1242  *********************************************************************/
1243 void free_pattern_spec(struct pattern_spec *pattern)
1244 {
1245    if (pattern == NULL) return;
1246
1247    freez(pattern->spec);
1248 #ifdef FEATURE_PCRE_HOST_PATTERNS
1249    if (pattern->pattern.url_spec.host_regex)
1250    {
1251       regfree(pattern->pattern.url_spec.host_regex);
1252       freez(pattern->pattern.url_spec.host_regex);
1253    }
1254 #endif /* def FEATURE_PCRE_HOST_PATTERNS */
1255    freez(pattern->pattern.url_spec.dbuffer);
1256    freez(pattern->pattern.url_spec.dvec);
1257    pattern->pattern.url_spec.dcount = 0;
1258    freez(pattern->pattern.url_spec.port_list);
1259    if (pattern->pattern.url_spec.preg)
1260    {
1261       regfree(pattern->pattern.url_spec.preg);
1262       freez(pattern->pattern.url_spec.preg);
1263    }
1264    if (pattern->pattern.tag_regex)
1265    {
1266       regfree(pattern->pattern.tag_regex);
1267       freez(pattern->pattern.tag_regex);
1268    }
1269 }
1270
1271
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks a port against an optional port list.
 *                A missing port list matches every port.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL if there is no restriction.
 *
 * Returns     :  TRUE (1) for yes, FALSE (0) otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (NULL == port_list)
   {
      /* No list means no restriction. */
      return 1;
   }

   return (0 != match_portlist(port_list, port));
}
1289
1290
1291 /*********************************************************************
1292  *
1293  * Function    :  host_matches
1294  *
1295  * Description :  Compares a host against a host pattern.
1296  *
1297  * Parameters  :
1298  *          1  :  url = The URL to match
1299  *          2  :  pattern = The URL pattern
1300  *
1301  * Returns     :  TRUE for yes, FALSE otherwise.
1302  *
1303  *********************************************************************/
1304 static int host_matches(const struct http_request *http,
1305                         const struct pattern_spec *pattern)
1306 {
1307    assert(http->host != NULL);
1308 #ifdef FEATURE_PCRE_HOST_PATTERNS
1309    if (pattern->pattern.url_spec.host_regex_type == PCRE_HOST_PATTERN)
1310    {
1311       return ((NULL == pattern->pattern.url_spec.host_regex)
1312          || (0 == regexec(pattern->pattern.url_spec.host_regex,
1313                http->host, 0, NULL, 0)));
1314    }
1315 #endif
1316    return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
1317 }
1318
1319
1320 /*********************************************************************
1321  *
1322  * Function    :  path_matches
1323  *
1324  * Description :  Compares a path against a path pattern.
1325  *
1326  * Parameters  :
1327  *          1  :  path = The path to match
1328  *          2  :  pattern = The URL pattern
1329  *
1330  * Returns     :  TRUE for yes, FALSE otherwise.
1331  *
1332  *********************************************************************/
1333 static int path_matches(const char *path, const struct pattern_spec *pattern)
1334 {
1335    return ((NULL == pattern->pattern.url_spec.preg)
1336       || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
1337 }
1338
1339
1340 /*********************************************************************
1341  *
1342  * Function    :  url_match
1343  *
1344  * Description :  Compare a URL against a URL pattern.
1345  *
1346  * Parameters  :
1347  *          1  :  pattern = a URL pattern
1348  *          2  :  url = URL to match
1349  *
1350  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1351  *
1352  *********************************************************************/
1353 int url_match(const struct pattern_spec *pattern,
1354               const struct http_request *http)
1355 {
1356    if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
1357    {
1358       /* It's not an URL pattern and thus shouldn't be matched against URLs */
1359       return 0;
1360    }
1361
1362    return (port_matches(http->port, pattern->pattern.url_spec.port_list)
1363       && host_matches(http, pattern) && path_matches(http->path, pattern));
1364
1365 }
1366
1367
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place; it is neither copied
 *                nor modified, so no memory is allocated per call.
 *
 *                Each item is interpreted as follows:
 *                - An item without '-' matches if its numeric value
 *                  equals the port.
 *                - Otherwise the item is split at its first '-' into
 *                  a lower and an upper bound. A bound without leading
 *                  number counts as 0, and an upper bound of 0 is
 *                  treated as 65535, so "-1023" and "1024-" are
 *                  open-ended ranges.
 *                Numbers are parsed like atoi() would: optional
 *                leading white space, optional sign, decimal digits,
 *                everything after the digits is ignored.
 *
 * Parameters  :
 *          1  :  portlist = String with list, must not be NULL
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   assert(portlist != NULL);

   for (;;)
   {
      /* The item ends at the next comma or at the end of the list. */
      const size_t item_length = strcspn(item, ",");
      const char *dash = memchr(item, '-', item_length);

      if (NULL == dash)
      {
         /*
          * No dash, check for equality. strtol() can't run into
          * the next item as ',' is neither white space nor digit.
          */
         if (port == (int)strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         char *parsed_end;
         int range_min = (int)strtol(item, &parsed_end, 10);
         int range_max;

         if (parsed_end > dash)
         {
            /*
             * The lower bound consisted of white space only and
             * strtol() mistook the range dash for a minus sign.
             * An empty lower bound is 0.
             */
            range_min = 0;
         }

         range_max = (int)strtol(dash + 1, NULL, 10);
         if (0 == range_max)
         {
            range_max = 65535;
         }

         if ((port >= range_min) && (port <= range_max))
         {
            return 1;
         }
      }

      if ('\0' == item[item_length])
      {
         /* That was the last item. */
         break;
      }

      /* Jump to the item behind the comma. */
      item += item_length + 1;
   }

   return 0;

}
1446
1447
1448 /*********************************************************************
1449  *
1450  * Function    :  parse_forwarder_address
1451  *
1452  * Description :  Parse out the username, password, host and port from
1453  *                a forwarder address.
1454  *
1455  * Parameters  :
1456  *          1  :  address = The forwarder address to parse.
1457  *          2  :  hostname = Used to return the hostname. NULL on error.
1458  *          3  :  port = Used to return the port. Untouched if no port
1459  *                       is specified.
1460  *          4  :  username = Used to return the username if any.
1461  *          5  :  password = Used to return the password if any.
1462  *
1463  * Returns     :  JB_ERR_OK on success
1464  *                JB_ERR_MEMORY on out of memory
1465  *                JB_ERR_PARSE on malformed address.
1466  *
1467  *********************************************************************/
1468 jb_err parse_forwarder_address(char *address, char **hostname, int *port,
1469                                char **username, char **password)
1470 {
1471    char *p;
1472    char *tmp;
1473
1474    tmp = *hostname = strdup_or_die(address);
1475
1476    /* Parse username and password */
1477    if (username && password && (NULL != (p = strchr(*hostname, '@'))))
1478    {
1479       *p++ = '\0';
1480       *username = strdup_or_die(*hostname);
1481       *hostname = strdup_or_die(p);
1482
1483       if (NULL != (p = strchr(*username, ':')))
1484       {
1485          *p++ = '\0';
1486          *password = strdup_or_die(p);
1487       }
1488       freez(tmp);
1489    }
1490
1491    /* Parse hostname and port */
1492    p = *hostname;
1493    if ((*p == '[') && (NULL == strchr(p, ']')))
1494    {
1495       /* XXX: Should do some more validity checks here. */
1496       return JB_ERR_PARSE;
1497    }
1498
1499    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1500    {
1501       *p++ = '\0';
1502       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1503       if (*p == ':')
1504       {
1505          *port = (int)strtol(++p, NULL, 0);
1506       }
1507    }
1508    else if (NULL != (p = strchr(*hostname, ':')))
1509    {
1510       *p++ = '\0';
1511       *port = (int)strtol(p, NULL, 0);
1512    }
1513
1514    return JB_ERR_OK;
1515
1516 }
1517
1518
1519 /*
1520   Local Variables:
1521   tab-width: 3
1522   end:
1523 */