urlmatch.c

   1 /*********************************************************************
   2  *
   3  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   4  *
   5  * Purpose     :  Declares functions to match URLs against URL
   6  *                patterns.
   7  *
   8  * Copyright   :  Written by and Copyright (C) 2001-2014
   9  *                the Privoxy team. https://www.privoxy.org/
  10  *
  11  *                Based on the Internet Junkbuster originally written
  12  *                by and Copyright (C) 1997 Anonymous Coders and
  13  *                Junkbusters Corporation.  http://www.junkbusters.com
  14  *
  15  *                This program is free software; you can redistribute it
  16  *                and/or modify it under the terms of the GNU General
  17  *                Public License as published by the Free Software
  18  *                Foundation; either version 2 of the License, or (at
  19  *                your option) any later version.
  20  *
  21  *                This program is distributed in the hope that it will
  22  *                be useful, but WITHOUT ANY WARRANTY; without even the
  23  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  24  *                PARTICULAR PURPOSE.  See the GNU General Public
  25  *                License for more details.
  26  *
  27  *                The GNU General Public License should be included with
  28  *                this file.  If not, you can view it at
  29  *                http://www.gnu.org/copyleft/gpl.html
  30  *                or write to the Free Software Foundation, Inc., 59
  31  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  32  *
  33  *********************************************************************/
  34
  35
  36 #include "config.h"
  37
  38 #ifndef _WIN32
  39 #include <stdio.h>
  40 #include <sys/types.h>
  41 #endif
  42
  43 #include <stdlib.h>
  44 #include <ctype.h>
  45 #include <assert.h>
  46 #include <string.h>
  47
  48 #if !defined(_WIN32) && !defined(__OS2__)
  49 #include <unistd.h>
  50 #endif
  51
  52 #include "project.h"
  53 #include "urlmatch.h"
  54 #include "ssplit.h"
  55 #include "miscutil.h"
  56 #include "errlog.h"
  57
/*
 * Selects how compile_pattern() decorates a pattern
 * before handing it to regcomp().
 */
enum regex_anchoring
{
   NO_ANCHORING,        /* Pattern is compiled as is ("%s") */
   LEFT_ANCHORED,       /* Pattern is prefixed with '^' ("^%s") */
   RIGHT_ANCHORED,      /* Pattern is suffixed with '$' ("%s$") */
   RIGHT_ANCHORED_HOST  /* Like RIGHT_ANCHORED, but an optional trailing dot is accepted ("%s\.?$") */
};
/* Forward declaration, the implementation depends on FEATURE_EXTENDED_HOST_PATTERNS. */
static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern);
  66
  67 /*********************************************************************
  68  *
  69  * Function    :  free_http_request
  70  *
  71  * Description :  Freez a http_request structure
  72  *
  73  * Parameters  :
  74  *          1  :  http = points to a http_request structure to free
  75  *
  76  * Returns     :  N/A
  77  *
  78  *********************************************************************/
  79 void free_http_request(struct http_request *http)
  80 {
  81    assert(http);
  82
  83    freez(http->cmd);
  84    freez(http->ocmd);
  85    freez(http->gpc);
  86    freez(http->host);
  87    freez(http->url);
  88    freez(http->hostport);
  89    freez(http->path);
  90    freez(http->version);
  91    freez(http->host_ip_addr_str);
  92 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  93    freez(http->dbuffer);
  94    freez(http->dvec);
  95    http->dcount = 0;
  96 #endif
  97 }
  98
  99
 100 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 101 /*********************************************************************
 102  *
 103  * Function    :  init_domain_components
 104  *
 105  * Description :  Splits the domain name so we can compare it
 106  *                against wildcards. It used to be part of
 107  *                parse_http_url, but was separated because the
 108  *                same code is required in chat in case of
 109  *                intercepted requests.
 110  *
 111  * Parameters  :
 112  *          1  :  http = pointer to the http structure to hold elements.
 113  *
 114  * Returns     :  JB_ERR_OK on success
 115  *                JB_ERR_PARSE on malformed command/URL
 116  *                             or >100 domains deep.
 117  *
 118  *********************************************************************/
 119 jb_err init_domain_components(struct http_request *http)
 120 {
 121    char *vec[BUFFER_SIZE];
 122    size_t size;
 123    char *p;
 124
 125    http->dbuffer = strdup_or_die(http->host);
 126
 127    /* map to lower case */
 128    for (p = http->dbuffer; *p ; p++)
 129    {
 130       *p = (char)privoxy_tolower(*p);
 131    }
 132
 133    /* split the domain name into components */
 134    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 135
 136    if (http->dcount <= 0)
 137    {
 138       /*
 139        * Error: More than SZ(vec) components in domain
 140        *    or: no components in domain
 141        */
 142       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 143       return JB_ERR_PARSE;
 144    }
 145
 146    /* save a copy of the pointers in dvec */
 147    size = (size_t)http->dcount * sizeof(*http->dvec);
 148
 149    http->dvec = malloc_or_die(size);
 150
 151    memcpy(http->dvec, vec, size);
 152
 153    return JB_ERR_OK;
 154 }
 155 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 156
 157
 158 /*********************************************************************
 159  *
 160  * Function    :  url_requires_percent_encoding
 161  *
 162  * Description :  Checks if an URL contains invalid characters
 163  *                according to RFC 3986 that should be percent-encoded.
 164  *                Does not verify whether or not the passed string
 165  *                actually is a valid URL.
 166  *
 167  * Parameters  :
 168  *          1  :  url = URL to check
 169  *
 170  * Returns     :  True in case of valid URLs, false otherwise
 171  *
 172  *********************************************************************/
 173 int url_requires_percent_encoding(const char *url)
 174 {
 175    static const char allowed_characters[128] = {
 176       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 177       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 178       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 179       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 180       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 181       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 182       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 183       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 184       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 185       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 186       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 187       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 188       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 189    };
 190
 191    while (*url != '\0')
 192    {
 193       const unsigned int i = (unsigned char)*url++;
 194       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 195       {
 196          return TRUE;
 197       }
 198    }
 199
 200    return FALSE;
 201
 202 }
 203
 204
 205 /*********************************************************************
 206  *
 207  * Function    :  parse_http_url
 208  *
 209  * Description :  Parse out the host and port from the URL.  Find the
 210  *                hostname & path, port (if ':'), and/or password (if '@')
 211  *
 212  * Parameters  :
 213  *          1  :  url = URL (or is it URI?) to break down
 214  *          2  :  http = pointer to the http structure to hold elements.
 215  *                       Must be initialized with valid values (like NULLs).
 216  *          3  :  require_protocol = Whether or not URLs without
 217  *                                   protocol are acceptable.
 218  *
 219  * Returns     :  JB_ERR_OK on success
 220  *                JB_ERR_PARSE on malformed command/URL
 221  *                             or >100 domains deep.
 222  *
 223  *********************************************************************/
 224 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 225 {
 226    int host_available = 1; /* A proxy can dream. */
 227
 228    /*
 229     * Save our initial URL
 230     */
 231    http->url = strdup_or_die(url);
 232
 233    /*
 234     * Check for * URI. If found, we're done.
 235     */
 236    if (*http->url == '*')
 237    {
 238       http->path = strdup_or_die("*");
 239       http->hostport = strdup_or_die("");
 240       if (http->url[1] != '\0')
 241       {
 242          return JB_ERR_PARSE;
 243       }
 244       return JB_ERR_OK;
 245    }
 246
 247
 248    /*
 249     * Split URL into protocol,hostport,path.
 250     */
 251    {
 252       char *buf;
 253       char *url_noproto;
 254       char *url_path;
 255
 256       buf = strdup_or_die(url);
 257
 258       /* Find the start of the URL in our scratch space */
 259       url_noproto = buf;
 260       if (strncmpic(url_noproto, "http://",  7) == 0)
 261       {
 262          url_noproto += 7;
 263       }
 264       else if (strncmpic(url_noproto, "https://", 8) == 0)
 265       {
 266          /*
 267           * Should only happen when called from cgi_show_url_info().
 268           */
 269          url_noproto += 8;
 270          http->ssl = 1;
 271       }
 272       else if (*url_noproto == '/')
 273       {
 274         /*
 275          * Short request line without protocol and host.
 276          * Most likely because the client's request
 277          * was intercepted and redirected into Privoxy.
 278          */
 279          http->host = NULL;
 280          host_available = 0;
 281       }
 282       else if (require_protocol)
 283       {
 284          freez(buf);
 285          return JB_ERR_PARSE;
 286       }
 287
 288       url_path = strchr(url_noproto, '/');
 289       if (url_path != NULL)
 290       {
 291          /*
 292           * Got a path.
 293           *
 294           * If FEATURE_HTTPS_INSPECTION isn't available, ignore the
 295           * path for https URLs so that we get consistent behaviour
 296           * if a https URL is parsed. When the URL is actually
 297           * retrieved, https hides the path part.
 298           */
 299          http->path = strdup_or_die(
 300 #ifndef FEATURE_HTTPS_INSPECTION
 301             http->ssl ? "/" :
 302 #endif
 303             url_path
 304          );
 305          *url_path = '\0';
 306          http->hostport = strdup_or_die(url_noproto);
 307       }
 308       else
 309       {
 310          /*
 311           * Repair broken HTTP requests that don't contain a path,
 312           * or CONNECT requests
 313           */
 314          http->path = strdup_or_die("/");
 315          http->hostport = strdup_or_die(url_noproto);
 316       }
 317
 318       freez(buf);
 319    }
 320
 321    if (!host_available)
 322    {
 323       /* Without host, there is nothing left to do here */
 324       return JB_ERR_OK;
 325    }
 326
 327    /*
 328     * Split hostport into user/password (ignored), host, port.
 329     */
 330    {
 331       char *buf;
 332       char *host;
 333       char *port;
 334
 335       buf = strdup_or_die(http->hostport);
 336
 337       /* check if url contains username and/or password */
 338       host = strchr(buf, '@');
 339       if (host != NULL)
 340       {
 341          /* Contains username/password, skip it and the @ sign. */
 342          host++;
 343       }
 344       else
 345       {
 346          /* No username or password. */
 347          host = buf;
 348       }
 349
 350       /* Move after hostname before port number */
 351       if (*host == '[')
 352       {
 353          /* Numeric IPv6 address delimited by brackets */
 354          host++;
 355          port = strchr(host, ']');
 356
 357          if (port == NULL)
 358          {
 359             /* Missing closing bracket */
 360             freez(buf);
 361             return JB_ERR_PARSE;
 362          }
 363
 364          *port++ = '\0';
 365
 366          if (*port == '\0')
 367          {
 368             port = NULL;
 369          }
 370          else if (*port != ':')
 371          {
 372             /* Garbage after closing bracket */
 373             freez(buf);
 374             return JB_ERR_PARSE;
 375          }
 376       }
 377       else
 378       {
 379          /* Plain non-escaped hostname */
 380          port = strchr(host, ':');
 381       }
 382
 383       /* check if url contains port */
 384       if (port != NULL)
 385       {
 386          /* Contains port */
 387          char *endptr;
 388          long parsed_port;
 389          /* Terminate hostname and point to start of port string */
 390          *port++ = '\0';
 391          parsed_port = strtol(port, &endptr, 10);
 392          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 393          {
 394             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 395             freez(buf);
 396             return JB_ERR_PARSE;
 397          }
 398          http->port = (int)parsed_port;
 399       }
 400       else
 401       {
 402          /* No port specified. */
 403          http->port = (http->ssl ? 443 : 80);
 404       }
 405
 406       http->host = strdup_or_die(host);
 407
 408       freez(buf);
 409    }
 410
 411 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 412    return JB_ERR_OK;
 413 #else
 414    /* Split domain name so we can compare it against wildcards */
 415    return init_domain_components(http);
 416 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 417
 418 }
 419
 420
 421 /*********************************************************************
 422  *
 423  * Function    :  unknown_method
 424  *
 425  * Description :  Checks whether a method is unknown.
 426  *
 427  * Parameters  :
 428  *          1  :  method = points to a http method
 429  *
 430  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 431  *
 432  *********************************************************************/
 433 static int unknown_method(const char *method)
 434 {
 435    static const char * const known_http_methods[] = {
 436       /* Basic HTTP request type */
 437       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 438       /* webDAV extensions (RFC2518) */
 439       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 440       /*
 441        * Microsoft webDAV extension for Exchange 2000.  See:
 442        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 443        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 444        */
 445       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 446       /*
 447        * Another Microsoft webDAV extension for Exchange 2000.  See:
 448        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 449        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 450        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 451        */
 452       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 453       /*
 454        * Yet another WebDAV extension, this time for
 455        * Web Distributed Authoring and Versioning (RFC3253)
 456        */
 457       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 458       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 459       /*
 460        * The PATCH method is defined by RFC5789, the format of the
 461        * actual patch in the body depends on the application, but from
 462        * Privoxy's point of view it doesn't matter.
 463        */
 464       "PATCH",
 465    };
 466    int i;
 467
 468    for (i = 0; i < SZ(known_http_methods); i++)
 469    {
 470       if (0 == strcmpic(method, known_http_methods[i]))
 471       {
 472          return FALSE;
 473       }
 474    }
 475
 476    return TRUE;
 477
 478 }
 479
 480
 481 /*********************************************************************
 482  *
 483  * Function    :  normalize_http_version
 484  *
 485  * Description :  Take a supported HTTP version string and remove
 486  *                leading zeroes etc., reject unsupported versions.
 487  *
 488  *                This is an explicit RFC 2616 (3.1) MUST and
 489  *                RFC 7230 mandates that intermediaries send their
 490  *                own HTTP-version in forwarded messages.
 491  *
 492  * Parameters  :
 493  *          1  :  http_version = HTTP version string
 494  *
 495  * Returns     :  JB_ERR_OK on success
 496  *                JB_ERR_PARSE if the HTTP version is unsupported
 497  *
 498  *********************************************************************/
 499 static jb_err normalize_http_version(char *http_version)
 500 {
 501    unsigned int major_version;
 502    unsigned int minor_version;
 503
 504    if (2 != sscanf(http_version, "HTTP/%u.%u", &major_version, &minor_version))
 505    {
 506       log_error(LOG_LEVEL_ERROR, "Unsupported HTTP version: %s", http_version);
 507       return JB_ERR_PARSE;
 508    }
 509
 510    if (major_version != 1 || (minor_version != 0 && minor_version != 1))
 511    {
 512       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 513          "versions are 1.0 and 1.1. This rules out: %s", http_version);
 514       return JB_ERR_PARSE;
 515    }
 516
 517    assert(strlen(http_version) >= 8);
 518    snprintf(http_version, 9, "HTTP/%u.%u", major_version, minor_version);
 519
 520    return JB_ERR_OK;
 521
 522 }
 523
 524
 525 /*********************************************************************
 526  *
 527  * Function    :  parse_http_request
 528  *
 529  * Description :  Parse out the host and port from the URL.  Find the
 530  *                hostname & path, port (if ':'), and/or password (if '@')
 531  *
 532  * Parameters  :
 533  *          1  :  req = HTTP request line to break down
 534  *          2  :  http = pointer to the http structure to hold elements
 535  *
 536  * Returns     :  JB_ERR_OK on success
 537  *                JB_ERR_CGI_PARAMS on malformed command/URL
 538  *                                  or >100 domains deep.
 539  *
 540  *********************************************************************/
 541 jb_err parse_http_request(const char *req, struct http_request *http)
 542 {
 543    char *buf;
 544    char *v[3];
 545    int n;
 546    jb_err err;
 547
 548    memset(http, '\0', sizeof(*http));
 549
 550    buf = strdup_or_die(req);
 551
 552    n = ssplit(buf, " \r\n", v, SZ(v));
 553    if (n != 3)
 554    {
 555       freez(buf);
 556       return JB_ERR_PARSE;
 557    }
 558
 559    /*
 560     * Fail in case of unknown methods
 561     * which we might not handle correctly.
 562     *
 563     * XXX: There should be a config option
 564     * to forward requests with unknown methods
 565     * anyway. Most of them don't need special
 566     * steps.
 567     */
 568    if (unknown_method(v[0]))
 569    {
 570       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 571       freez(buf);
 572       return JB_ERR_PARSE;
 573    }
 574
 575    if (JB_ERR_OK != normalize_http_version(v[2]))
 576    {
 577       freez(buf);
 578       return JB_ERR_PARSE;
 579    }
 580
 581    http->ssl = !strcmpic(v[0], "CONNECT");
 582
 583    err = parse_http_url(v[1], http, !http->ssl);
 584    if (err)
 585    {
 586       freez(buf);
 587       return err;
 588    }
 589
 590    /*
 591     * Copy the details into the structure
 592     */
 593    http->cmd = strdup_or_die(req);
 594    http->gpc = strdup_or_die(v[0]);
 595    http->version = strdup_or_die(v[2]);
 596    http->ocmd = strdup_or_die(http->cmd);
 597
 598    freez(buf);
 599
 600    return JB_ERR_OK;
 601
 602 }
 603
 604
 605 /*********************************************************************
 606  *
 607  * Function    :  compile_pattern
 608  *
 609  * Description :  Compiles a host, domain or TAG pattern.
 610  *
 611  * Parameters  :
 612  *          1  :  pattern = The pattern to compile.
 613  *          2  :  anchoring = How the regex should be modified
 614  *                            before compilation. Can be either
 615  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 616  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 617  *          3  :  url     = In case of failures, the spec member is
 618  *                          logged and the structure freed.
 619  *          4  :  regex   = Where the compiled regex should be stored.
 620  *
 621  * Returns     :  JB_ERR_OK - Success
 622  *                JB_ERR_PARSE - Cannot parse regex
 623  *
 624  *********************************************************************/
 625 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 626                               struct pattern_spec *url, regex_t **regex)
 627 {
 628    int errcode;
 629    const char *fmt = NULL;
 630    char *rebuf;
 631    size_t rebuf_size;
 632
 633    assert(pattern);
 634
 635    if (pattern[0] == '\0')
 636    {
 637       *regex = NULL;
 638       return JB_ERR_OK;
 639    }
 640
 641    switch (anchoring)
 642    {
 643       case NO_ANCHORING:
 644          fmt = "%s";
 645          break;
 646       case RIGHT_ANCHORED:
 647          fmt = "%s$";
 648          break;
 649       case RIGHT_ANCHORED_HOST:
 650          fmt = "%s\\.?$";
 651          break;
 652       case LEFT_ANCHORED:
 653          fmt = "^%s";
 654          break;
 655       default:
 656          log_error(LOG_LEVEL_FATAL,
 657             "Invalid anchoring in compile_pattern %d", anchoring);
 658    }
 659    rebuf_size = strlen(pattern) + strlen(fmt);
 660    rebuf = malloc_or_die(rebuf_size);
 661    *regex = zalloc_or_die(sizeof(**regex));
 662
 663    snprintf(rebuf, rebuf_size, fmt, pattern);
 664
 665    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 666
 667    if (errcode)
 668    {
 669       size_t errlen = regerror(errcode, *regex, rebuf, rebuf_size);
 670       if (errlen > (rebuf_size - (size_t)1))
 671       {
 672          errlen = rebuf_size - (size_t)1;
 673       }
 674       rebuf[errlen] = '\0';
 675       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 676          pattern, url->spec, rebuf);
 677       free_pattern_spec(url);
 678       freez(rebuf);
 679
 680       return JB_ERR_PARSE;
 681    }
 682    freez(rebuf);
 683
 684    return JB_ERR_OK;
 685
 686 }
 687
 688
 689 /*********************************************************************
 690  *
 691  * Function    :  compile_url_pattern
 692  *
 693  * Description :  Compiles the three parts of an URL pattern.
 694  *
 695  * Parameters  :
 696  *          1  :  url = Target pattern_spec to be filled in.
 697  *          2  :  buf = The url pattern to compile. Will be messed up.
 698  *
 699  * Returns     :  JB_ERR_OK - Success
 700  *                JB_ERR_MEMORY - Out of memory
 701  *                JB_ERR_PARSE - Cannot parse regex
 702  *
 703  *********************************************************************/
 704 static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
 705 {
 706    char *p;
 707
 708    p = strchr(buf, '/');
 709    if (NULL != p)
 710    {
 711       /*
 712        * Only compile the regex if it consists of more than
 713        * a single slash, otherwise it wouldn't affect the result.
 714        */
 715       if (p[1] != '\0')
 716       {
 717          /*
 718           * XXX: does it make sense to compile the slash at the beginning?
 719           */
 720          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
 721
 722          if (JB_ERR_OK != err)
 723          {
 724             return err;
 725          }
 726       }
 727       *p = '\0';
 728    }
 729
 730    /*
 731     * IPv6 numeric hostnames can contain colons, thus we need
 732     * to delimit the hostname before the real port separator.
 733     * As brackets are already used in the hostname pattern,
 734     * we use angle brackets ('<', '>') instead.
 735     */
 736    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 737    {
 738       *p++ = '\0';
 739       buf++;
 740
 741       if (*p == '\0')
 742       {
 743          /* IPv6 address without port number */
 744          p = NULL;
 745       }
 746       else if (*p != ':')
 747       {
 748          /* Garbage after address delimiter */
 749          return JB_ERR_PARSE;
 750       }
 751    }
 752    else
 753    {
 754       p = strchr(buf, ':');
 755    }
 756
 757    if (NULL != p)
 758    {
 759       *p++ = '\0';
 760       url->pattern.url_spec.port_list = strdup_or_die(p);
 761    }
 762    else
 763    {
 764       url->pattern.url_spec.port_list = NULL;
 765    }
 766
 767    if (buf[0] != '\0')
 768    {
 769       return compile_host_pattern(url, buf);
 770    }
 771
 772    return JB_ERR_OK;
 773
 774 }
 775
 776
 777 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 778 /*********************************************************************
 779  *
 780  * Function    :  compile_host_pattern
 781  *
 782  * Description :  Parses and compiles a host pattern.
 783  *
 784  * Parameters  :
 785  *          1  :  url = Target pattern_spec to be filled in.
 786  *          2  :  host_pattern = Host pattern to compile.
 787  *
 788  * Returns     :  JB_ERR_OK - Success
 789  *                JB_ERR_MEMORY - Out of memory
 790  *                JB_ERR_PARSE - Cannot parse regex
 791  *
 792  *********************************************************************/
 793 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 794 {
 795    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->pattern.url_spec.host_regex);
 796 }
 797
 798 #else
 799
 800 /*********************************************************************
 801  *
 802  * Function    :  compile_host_pattern
 803  *
 804  * Description :  Parses and "compiles" an old-school host pattern.
 805  *
 806  * Parameters  :
 807  *          1  :  url = Target pattern_spec to be filled in.
 808  *          2  :  host_pattern = Host pattern to parse.
 809  *
 810  * Returns     :  JB_ERR_OK - Success
 811  *                JB_ERR_PARSE - Cannot parse regex
 812  *
 813  *********************************************************************/
 814 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 815 {
 816    char *v[150];
 817    size_t size;
 818    char *p;
 819
 820    /*
 821     * Parse domain part
 822     */
 823    if (host_pattern[strlen(host_pattern) - 1] == '.')
 824    {
 825       url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
 826    }
 827    if (host_pattern[0] == '.')
 828    {
 829       url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
 830    }
 831
 832    /*
 833     * Split domain into components
 834     */
 835    url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
 836
 837    /*
 838     * Map to lower case
 839     */
 840    for (p = url->pattern.url_spec.dbuffer; *p ; p++)
 841    {
 842       *p = (char)privoxy_tolower(*p);
 843    }
 844
 845    /*
 846     * Split the domain name into components
 847     */
 848    url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
 849
 850    if (url->pattern.url_spec.dcount < 0)
 851    {
 852       free_pattern_spec(url);
 853       return JB_ERR_PARSE;
 854    }
 855    else if (url->pattern.url_spec.dcount != 0)
 856    {
 857       /*
 858        * Save a copy of the pointers in dvec
 859        */
 860       size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
 861
 862       url->pattern.url_spec.dvec = malloc_or_die(size);
 863
 864       memcpy(url->pattern.url_spec.dvec, v, size);
 865    }
 866    /*
 867     * else dcount == 0 in which case we needn't do anything,
 868     * since dvec will never be accessed and the pattern will
 869     * match all domains.
 870     */
 871    return JB_ERR_OK;
 872 }
 873
 874
 875 /*********************************************************************
 876  *
 877  * Function    :  simplematch
 878  *
 879  * Description :  String matching, with a (greedy) '*' wildcard that
 880  *                stands for zero or more arbitrary characters and
 881  *                character classes in [], which take both enumerations
 882  *                and ranges.
 883  *
 884  * Parameters  :
 885  *          1  :  pattern = pattern for matching
 886  *          2  :  text    = text to be matched
 887  *
 888  * Returns     :  0 if match, else nonzero
 889  *
 890  *********************************************************************/
 891 static int simplematch(const char *pattern, const char *text)
 892 {
 893    const unsigned char *pat = (const unsigned char *)pattern;
 894    const unsigned char *txt = (const unsigned char *)text;
 895    const unsigned char *fallback = pat;
 896    int wildcard = 0;
 897
 898    unsigned char lastchar = 'a';
 899    unsigned i;
 900    unsigned char charmap[32];
 901
 902    while (*txt)
 903    {
 904
 905       /* EOF pattern but !EOF text? */
 906       if (*pat == '\0')
 907       {
 908          if (wildcard)
 909          {
 910             pat = fallback;
 911          }
 912          else
 913          {
 914             return 1;
 915          }
 916       }
 917
 918       /* '*' in the pattern?  */
 919       if (*pat == '*')
 920       {
 921
 922          /* The pattern ends afterwards? Speed up the return. */
 923          if (*++pat == '\0')
 924          {
 925             return 0;
 926          }
 927
 928          /* Else, set wildcard mode and remember position after '*' */
 929          wildcard = 1;
 930          fallback = pat;
 931       }
 932
 933       /* Character range specification? */
 934       if (*pat == '[')
 935       {
 936          memset(charmap, '\0', sizeof(charmap));
 937
 938          while (*++pat != ']')
 939          {
 940             if (!*pat)
 941             {
 942                return 1;
 943             }
 944             else if (*pat == '-')
 945             {
 946                if ((*++pat == ']') || *pat == '\0')
 947                {
 948                   return(1);
 949                }
 950                for (i = lastchar; i <= *pat; i++)
 951                {
 952                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 953                }
 954             }
 955             else
 956             {
 957                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 958                lastchar = *pat;
 959             }
 960          }
 961       } /* -END- if Character range specification */
 962
 963
 964       /*
 965        * Char match, or char range match?
 966        */
 967       if ((*pat == *txt)
 968        || (*pat == '?')
 969        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 970       {
 971          /*
 972           * Success: Go ahead
 973           */
 974          pat++;
 975       }
 976       else if (!wildcard)
 977       {
 978          /*
 979           * No match && no wildcard: No luck
 980           */
 981          return 1;
 982       }
 983       else if (pat != fallback)
 984       {
 985          /*
 986           * Increment text pointer if in char range matching
 987           */
 988          if (*pat == ']')
 989          {
 990             txt++;
 991          }
 992          /*
 993           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 994           */
 995          pat = fallback;
 996          /*
 997           * Restart matching from current text pointer
 998           */
 999          continue;
1000       }
1001       txt++;
1002    }
1003
1004    /* Cut off extra '*'s */
1005    if (*pat == '*') pat++;
1006
1007    /* If this is the pattern's end, fine! */
1008    return(*pat);
1009
1010 }
1011
1012
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Compares two vectors of domain components pairwise.
 *                Component n of the pattern vector is checked against
 *                component n of the domain vector with simplematch(),
 *                so the comparison is anchored on both sides.
 *                This is only used by domain_match.
 *
 * Parameters  :
 *          1  :  pv = array of patterns to compare
 *          2  :  fv = array of domain components to compare
 *          3  :  len = number of entries that are compared in both
 *                      arrays.  The caller has to make sure that
 *                      both arrays contain at least that many.
 *
 * Returns     :  0 => every component matched its pattern,
 *                1 => at least one component did not match.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   char **pattern_part = pv;
   char **domain_part = fv;
   int remaining;

   for (remaining = len; remaining > 0; remaining--)
   {
      /* simplematch() reports a match with 0, like strcmp() */
      if (simplematch(*pattern_part, *domain_part) != 0)
      {
         return 1;
      }
      pattern_part++;
      domain_part++;
   }

   return 0;

}
1047
1048
1049 /*********************************************************************
1050  *
1051  * Function    :  domain_match
1052  *
1053  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1054  *                p.pattern->unachored, the comparison is un-, left-,
1055  *                right-anchored, or both.
1056  *                The individual domain names are compared with
1057  *                simplematch().
1058  *
1059  * Parameters  :
1060  *          1  :  p = a domain that may contain a '*' as a wildcard.
1061  *          2  :  fqdn = domain name against which the patterns are compared.
1062  *
1063  * Returns     :  0 => domains are equivalent, else no match.
1064  *
1065  *********************************************************************/
1066 static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
1067 {
1068    char **pv, **fv;  /* vectors  */
1069    int    plen, flen;
1070    int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1071
1072    plen = p->pattern.url_spec.dcount;
1073    flen = fqdn->dcount;
1074
1075    if (flen < plen)
1076    {
1077       /* fqdn is too short to match this pattern */
1078       return 1;
1079    }
1080
1081    pv   = p->pattern.url_spec.dvec;
1082    fv   = fqdn->dvec;
1083
1084    if (unanchored == ANCHOR_LEFT)
1085    {
1086       /*
1087        * Right anchored.
1088        *
1089        * Convert this into a fully anchored pattern with
1090        * the fqdn and pattern the same length
1091        */
1092       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1093       return simple_domaincmp(pv, fv, plen);
1094    }
1095    else if (unanchored == 0)
1096    {
1097       /* Fully anchored, check length */
1098       if (flen != plen)
1099       {
1100          return 1;
1101       }
1102       return simple_domaincmp(pv, fv, plen);
1103    }
1104    else if (unanchored == ANCHOR_RIGHT)
1105    {
1106       /* Left anchored, ignore all extra in fqdn */
1107       return simple_domaincmp(pv, fv, plen);
1108    }
1109    else
1110    {
1111       /* Unanchored */
1112       int n;
1113       int maxn = flen - plen;
1114       for (n = 0; n <= maxn; n++)
1115       {
1116          if (!simple_domaincmp(pv, fv, plen))
1117          {
1118             return 0;
1119          }
1120          /*
1121           * Doesn't match from start of fqdn
1122           * Try skipping first part of fqdn
1123           */
1124          fv++;
1125       }
1126       return 1;
1127    }
1128
1129 }
1130 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1131
1132
1133 /*********************************************************************
1134  *
1135  * Function    :  create_pattern_spec
1136  *
1137  * Description :  Creates a "pattern_spec" structure from a string.
1138  *                When finished, free with free_pattern_spec().
1139  *
1140  * Parameters  :
1141  *          1  :  pattern = Target pattern_spec to be filled in.
1142  *                          Will be zeroed before use.
1143  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1144  *                      contents of this buffer are destroyed by this
1145  *                      function.  If this function succeeds, the
1146  *                      buffer is copied to pattern->spec.  If this
1147  *                      function fails, the contents of the buffer
1148  *                      are lost forever.
1149  *
1150  * Returns     :  JB_ERR_OK - Success
1151  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1152  *                               written to system log)
1153  *
1154  *********************************************************************/
1155 jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
1156 {
1157    static const struct
1158    {
1159       /** The tag pattern prefix to match */
1160       const char *prefix;
1161
1162       /** The length of the prefix to match */
1163       const size_t prefix_length;
1164
1165       /** The pattern flag */
1166       const unsigned flag;
1167    } tag_pattern[] = {
1168       { "TAG:",              4, PATTERN_SPEC_TAG_PATTERN},
1169  #ifdef FEATURE_CLIENT_TAGS
1170       { "CLIENT-TAG:",      11, PATTERN_SPEC_CLIENT_TAG_PATTERN},
1171  #endif
1172       { "NO-REQUEST-TAG:",  15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
1173       { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
1174    };
1175    int i;
1176
1177    assert(pattern);
1178    assert(buf);
1179
1180    memset(pattern, '\0', sizeof(*pattern));
1181
1182    /* Remember the original specification for the CGI pages. */
1183    pattern->spec = strdup_or_die(buf);
1184
1185    /* Check if it's a tag pattern */
1186    for (i = 0; i < SZ(tag_pattern); i++)
1187    {
1188       if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
1189       {
1190          /* The regex starts after the prefix */
1191          const char *tag_regex = buf + tag_pattern[i].prefix_length;
1192
1193          pattern->flags |= tag_pattern[i].flag;
1194
1195          return compile_pattern(tag_regex, NO_ANCHORING, pattern,
1196             &pattern->pattern.tag_regex);
1197       }
1198    }
1199
1200    /* If it isn't a tag pattern it must be an URL pattern. */
1201    pattern->flags |= PATTERN_SPEC_URL_PATTERN;
1202
1203    return compile_url_pattern(pattern, buf);
1204
1205 }
1206
1207
1208 /*********************************************************************
1209  *
1210  * Function    :  free_pattern_spec
1211  *
1212  * Description :  Called from the "unloaders".  Freez the pattern
1213  *                structure elements.
1214  *
1215  * Parameters  :
1216  *          1  :  pattern = pointer to a pattern_spec structure.
1217  *
1218  * Returns     :  N/A
1219  *
1220  *********************************************************************/
1221 void free_pattern_spec(struct pattern_spec *pattern)
1222 {
1223    if (pattern == NULL) return;
1224
1225    freez(pattern->spec);
1226 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1227    if (pattern->pattern.url_spec.host_regex)
1228    {
1229       regfree(pattern->pattern.url_spec.host_regex);
1230       freez(pattern->pattern.url_spec.host_regex);
1231    }
1232 #else
1233    freez(pattern->pattern.url_spec.dbuffer);
1234    freez(pattern->pattern.url_spec.dvec);
1235    pattern->pattern.url_spec.dcount = 0;
1236 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1237    freez(pattern->pattern.url_spec.port_list);
1238    if (pattern->pattern.url_spec.preg)
1239    {
1240       regfree(pattern->pattern.url_spec.preg);
1241       freez(pattern->pattern.url_spec.preg);
1242    }
1243    if (pattern->pattern.tag_regex)
1244    {
1245       regfree(pattern->pattern.tag_regex);
1246       freez(pattern->pattern.tag_regex);
1247    }
1248 }
1249
1250
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks whether a port is covered by a port list.
 *                A missing port list covers every port.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL if there is no restriction.
 *
 * Returns     :  TRUE for yes, FALSE otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (NULL == port_list)
   {
      /* No list means there is nothing the port could violate */
      return 1;
   }

   return (0 != match_portlist(port_list, port));
}
1268
1269
1270 /*********************************************************************
1271  *
1272  * Function    :  host_matches
1273  *
1274  * Description :  Compares a host against a host pattern.
1275  *
1276  * Parameters  :
1277  *          1  :  url = The URL to match
1278  *          2  :  pattern = The URL pattern
1279  *
1280  * Returns     :  TRUE for yes, FALSE otherwise.
1281  *
1282  *********************************************************************/
1283 static int host_matches(const struct http_request *http,
1284                         const struct pattern_spec *pattern)
1285 {
1286    assert(http->host != NULL);
1287 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1288    return ((NULL == pattern->pattern.url_spec.host_regex)
1289       || (0 == regexec(pattern->pattern.url_spec.host_regex, http->host, 0, NULL, 0)));
1290 #else
1291    return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
1292 #endif
1293 }
1294
1295
1296 /*********************************************************************
1297  *
1298  * Function    :  path_matches
1299  *
1300  * Description :  Compares a path against a path pattern.
1301  *
1302  * Parameters  :
1303  *          1  :  path = The path to match
1304  *          2  :  pattern = The URL pattern
1305  *
1306  * Returns     :  TRUE for yes, FALSE otherwise.
1307  *
1308  *********************************************************************/
1309 static int path_matches(const char *path, const struct pattern_spec *pattern)
1310 {
1311    return ((NULL == pattern->pattern.url_spec.preg)
1312       || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
1313 }
1314
1315
1316 /*********************************************************************
1317  *
1318  * Function    :  url_match
1319  *
1320  * Description :  Compare a URL against a URL pattern.
1321  *
1322  * Parameters  :
1323  *          1  :  pattern = a URL pattern
1324  *          2  :  url = URL to match
1325  *
1326  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1327  *
1328  *********************************************************************/
1329 int url_match(const struct pattern_spec *pattern,
1330               const struct http_request *http)
1331 {
1332    if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
1333    {
1334       /* It's not an URL pattern and thus shouldn't be matched against URLs */
1335       return 0;
1336    }
1337
1338    return (port_matches(http->port, pattern->pattern.url_spec.port_list)
1339       && host_matches(http, pattern) && path_matches(http->path, pattern));
1340
1341 }
1342
1343
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is parsed in place, no copy is made.
 *                An item without dash has to equal the port.  An
 *                item with dash is a range; a missing lower bound
 *                counts as 0, a missing (or zero) upper bound as
 *                65535.  Items that don't start with a number are
 *                treated as 0.
 *
 * Parameters  :
 *          1  :  portlist = String with list, must not be NULL.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   assert(portlist != NULL);

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      /* The item ends at the next comma or at the end of the list */
      const char *comma = strchr(item, (int) ',');
      const size_t item_length = (NULL != comma) ? (size_t)(comma - item) : strlen(item);
      /* Only a dash inside the current item makes it a range */
      const char *dash = memchr(item, (int) '-', item_length);
      char *parse_end;
      /*
       * strtol() stops at the first character that can't be part
       * of the number, which includes the comma, so parsing never
       * runs into the next item.
       */
      long low = strtol(item, &parse_end, 10);

      if (NULL == dash)
      {
         /*
          * No dash, check for equality
          */
         if (port == low)
         {
            return 1;
         }
      }
      else
      {
         long high;

         if (parse_end > dash)
         {
            /*
             * The lower bound is missing and strtol() took the
             * range dash for a minus sign: the range starts at 0.
             */
            low = 0;
         }

         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         high = strtol(dash + 1, NULL, 10);
         if (0 == high)
         {
            high = 65535;
         }

         if ((port >= low) && (port <= high))
         {
            return 1;
         }
      }

      /*
       * Jump to next item
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1422
1423
1424 /*********************************************************************
1425  *
1426  * Function    :  parse_forwarder_address
1427  *
1428  * Description :  Parse out the username, password, host and port from
1429  *                a forwarder address.
1430  *
1431  * Parameters  :
1432  *          1  :  address = The forwarder address to parse.
1433  *          2  :  hostname = Used to return the hostname. NULL on error.
1434  *          3  :  port = Used to return the port. Untouched if no port
1435  *                       is specified.
1436  *          4  :  username = Used to return the username if any.
1437  *          5  :  password = Used to return the password if any.
1438  *
1439  * Returns     :  JB_ERR_OK on success
1440  *                JB_ERR_MEMORY on out of memory
1441  *                JB_ERR_PARSE on malformed address.
1442  *
1443  *********************************************************************/
1444 jb_err parse_forwarder_address(char *address, char **hostname, int *port,
1445                                char **username, char **password)
1446 {
1447    char *p;
1448    char *tmp;
1449
1450    tmp = *hostname = strdup_or_die(address);
1451
1452    /* Parse username and password */
1453    if (username && password && (NULL != (p = strchr(*hostname, '@'))))
1454    {
1455       *p++ = '\0';
1456       *username = strdup_or_die(*hostname);
1457       *hostname = strdup_or_die(p);
1458
1459       if (NULL != (p = strchr(*username, ':')))
1460       {
1461          *p++ = '\0';
1462          *password = strdup_or_die(p);
1463       }
1464       freez(tmp);
1465    }
1466
1467    /* Parse hostname and port */
1468    p = *hostname;
1469    if ((*p == '[') && (NULL == strchr(p, ']')))
1470    {
1471       /* XXX: Should do some more validity checks here. */
1472       return JB_ERR_PARSE;
1473    }
1474
1475    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1476    {
1477       *p++ = '\0';
1478       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1479       if (*p == ':')
1480       {
1481          *port = (int)strtol(++p, NULL, 0);
1482       }
1483    }
1484    else if (NULL != (p = strchr(*hostname, ':')))
1485    {
1486       *p++ = '\0';
1487       *port = (int)strtol(p, NULL, 0);
1488    }
1489
1490    return JB_ERR_OK;
1491
1492 }
1493
1494
1495 /*
1496   Local Variables:
1497   tab-width: 3
1498   end:
1499 */