urlmatch.c

   1 /*********************************************************************
   2  *
   3  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   4  *
   5  * Purpose     :  Declares functions to match URLs against URL
   6  *                patterns.
   7  *
   8  * Copyright   :  Written by and Copyright (C) 2001-2014
   9  *                the Privoxy team. http://www.privoxy.org/
  10  *
  11  *                Based on the Internet Junkbuster originally written
  12  *                by and Copyright (C) 1997 Anonymous Coders and
  13  *                Junkbusters Corporation.  http://www.junkbusters.com
  14  *
  15  *                This program is free software; you can redistribute it
  16  *                and/or modify it under the terms of the GNU General
  17  *                Public License as published by the Free Software
  18  *                Foundation; either version 2 of the License, or (at
  19  *                your option) any later version.
  20  *
  21  *                This program is distributed in the hope that it will
  22  *                be useful, but WITHOUT ANY WARRANTY; without even the
  23  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  24  *                PARTICULAR PURPOSE.  See the GNU General Public
  25  *                License for more details.
  26  *
  27  *                The GNU General Public License should be included with
  28  *                this file.  If not, you can view it at
  29  *                http://www.gnu.org/copyleft/gpl.html
  30  *                or write to the Free Software Foundation, Inc., 59
  31  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  32  *
  33  *********************************************************************/
  34
  35
  36 #include "config.h"
  37
  38 #ifndef _WIN32
  39 #include <stdio.h>
  40 #include <sys/types.h>
  41 #endif
  42
  43 #include <stdlib.h>
  44 #include <ctype.h>
  45 #include <assert.h>
  46 #include <string.h>
  47
  48 #if !defined(_WIN32) && !defined(__OS2__)
  49 #include <unistd.h>
  50 #endif
  51
  52 #include "project.h"
  53 #include "urlmatch.h"
  54 #include "ssplit.h"
  55 #include "miscutil.h"
  56 #include "errlog.h"
  57
  58 enum regex_anchoring
  59 {
  60    NO_ANCHORING,
  61    LEFT_ANCHORED,
  62    RIGHT_ANCHORED,
  63    RIGHT_ANCHORED_HOST
  64 };
/*
 * Forward declaration: compile_url_pattern() calls this function
 * before its definition appears further down in this file.
 */
static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern);
  66
  67 /*********************************************************************
  68  *
  69  * Function    :  free_http_request
  70  *
  71  * Description :  Freez a http_request structure
  72  *
  73  * Parameters  :
  74  *          1  :  http = points to a http_request structure to free
  75  *
  76  * Returns     :  N/A
  77  *
  78  *********************************************************************/
  79 void free_http_request(struct http_request *http)
  80 {
  81    assert(http);
  82
  83    freez(http->cmd);
  84    freez(http->ocmd);
  85    freez(http->gpc);
  86    freez(http->host);
  87    freez(http->url);
  88    freez(http->hostport);
  89    freez(http->path);
  90    freez(http->ver);
  91    freez(http->host_ip_addr_str);
  92 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  93    freez(http->dbuffer);
  94    freez(http->dvec);
  95    http->dcount = 0;
  96 #endif
  97 }
  98
  99
 100 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 101 /*********************************************************************
 102  *
 103  * Function    :  init_domain_components
 104  *
 105  * Description :  Splits the domain name so we can compare it
 106  *                against wildcards. It used to be part of
 107  *                parse_http_url, but was separated because the
 108  *                same code is required in chat in case of
 109  *                intercepted requests.
 110  *
 111  * Parameters  :
 112  *          1  :  http = pointer to the http structure to hold elements.
 113  *
 114  * Returns     :  JB_ERR_OK on success
 115  *                JB_ERR_PARSE on malformed command/URL
 116  *                             or >100 domains deep.
 117  *
 118  *********************************************************************/
 119 jb_err init_domain_components(struct http_request *http)
 120 {
 121    char *vec[BUFFER_SIZE];
 122    size_t size;
 123    char *p;
 124
 125    http->dbuffer = strdup_or_die(http->host);
 126
 127    /* map to lower case */
 128    for (p = http->dbuffer; *p ; p++)
 129    {
 130       *p = (char)privoxy_tolower(*p);
 131    }
 132
 133    /* split the domain name into components */
 134    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 135
 136    if (http->dcount <= 0)
 137    {
 138       /*
 139        * Error: More than SZ(vec) components in domain
 140        *    or: no components in domain
 141        */
 142       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 143       return JB_ERR_PARSE;
 144    }
 145
 146    /* save a copy of the pointers in dvec */
 147    size = (size_t)http->dcount * sizeof(*http->dvec);
 148
 149    http->dvec = malloc_or_die(size);
 150
 151    memcpy(http->dvec, vec, size);
 152
 153    return JB_ERR_OK;
 154 }
 155 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 156
 157
 158 /*********************************************************************
 159  *
 160  * Function    :  url_requires_percent_encoding
 161  *
 162  * Description :  Checks if an URL contains invalid characters
 163  *                according to RFC 3986 that should be percent-encoded.
 164  *                Does not verify whether or not the passed string
 165  *                actually is a valid URL.
 166  *
 167  * Parameters  :
 168  *          1  :  url = URL to check
 169  *
 170  * Returns     :  True in case of valid URLs, false otherwise
 171  *
 172  *********************************************************************/
 173 int url_requires_percent_encoding(const char *url)
 174 {
 175    static const char allowed_characters[128] = {
 176       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 177       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 178       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 179       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 180       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 181       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 182       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 183       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 184       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 185       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 186       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 187       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 188       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 189    };
 190
 191    while (*url != '\0')
 192    {
 193       const unsigned int i = (unsigned char)*url++;
 194       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 195       {
 196          return TRUE;
 197       }
 198    }
 199
 200    return FALSE;
 201
 202 }
 203
 204
 205 /*********************************************************************
 206  *
 207  * Function    :  parse_http_url
 208  *
 209  * Description :  Parse out the host and port from the URL.  Find the
 210  *                hostname & path, port (if ':'), and/or password (if '@')
 211  *
 212  * Parameters  :
 213  *          1  :  url = URL (or is it URI?) to break down
 214  *          2  :  http = pointer to the http structure to hold elements.
 215  *                       Must be initialized with valid values (like NULLs).
 216  *          3  :  require_protocol = Whether or not URLs without
 217  *                                   protocol are acceptable.
 218  *
 219  * Returns     :  JB_ERR_OK on success
 220  *                JB_ERR_PARSE on malformed command/URL
 221  *                             or >100 domains deep.
 222  *
 223  *********************************************************************/
 224 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 225 {
 226    int host_available = 1; /* A proxy can dream. */
 227
 228    /*
 229     * Save our initial URL
 230     */
 231    http->url = strdup_or_die(url);
 232
 233    /*
 234     * Check for * URI. If found, we're done.
 235     */
 236    if (*http->url == '*')
 237    {
 238       http->path = strdup_or_die("*");
 239       http->hostport = strdup_or_die("");
 240       if (http->url[1] != '\0')
 241       {
 242          return JB_ERR_PARSE;
 243       }
 244       return JB_ERR_OK;
 245    }
 246
 247
 248    /*
 249     * Split URL into protocol,hostport,path.
 250     */
 251    {
 252       char *buf;
 253       char *url_noproto;
 254       char *url_path;
 255
 256       buf = strdup_or_die(url);
 257
 258       /* Find the start of the URL in our scratch space */
 259       url_noproto = buf;
 260       if (strncmpic(url_noproto, "http://",  7) == 0)
 261       {
 262          url_noproto += 7;
 263       }
 264       else if (strncmpic(url_noproto, "https://", 8) == 0)
 265       {
 266          /*
 267           * Should only happen when called from cgi_show_url_info().
 268           */
 269          url_noproto += 8;
 270          http->ssl = 1;
 271       }
 272       else if (*url_noproto == '/')
 273       {
 274         /*
 275          * Short request line without protocol and host.
 276          * Most likely because the client's request
 277          * was intercepted and redirected into Privoxy.
 278          */
 279          http->host = NULL;
 280          host_available = 0;
 281       }
 282       else if (require_protocol)
 283       {
 284          freez(buf);
 285          return JB_ERR_PARSE;
 286       }
 287
 288       url_path = strchr(url_noproto, '/');
 289       if (url_path != NULL)
 290       {
 291          /*
 292           * Got a path.
 293           *
 294           * NOTE: The following line ignores the path for HTTPS URLS.
 295           * This means that you get consistent behaviour if you type a
 296           * https URL in and it's parsed by the function.  (When the
 297           * URL is actually retrieved, SSL hides the path part).
 298           */
 299          http->path = strdup_or_die(http->ssl ? "/" : url_path);
 300          *url_path = '\0';
 301          http->hostport = strdup_or_die(url_noproto);
 302       }
 303       else
 304       {
 305          /*
 306           * Repair broken HTTP requests that don't contain a path,
 307           * or CONNECT requests
 308           */
 309          http->path = strdup_or_die("/");
 310          http->hostport = strdup_or_die(url_noproto);
 311       }
 312
 313       freez(buf);
 314    }
 315
 316    if (!host_available)
 317    {
 318       /* Without host, there is nothing left to do here */
 319       return JB_ERR_OK;
 320    }
 321
 322    /*
 323     * Split hostport into user/password (ignored), host, port.
 324     */
 325    {
 326       char *buf;
 327       char *host;
 328       char *port;
 329
 330       buf = strdup_or_die(http->hostport);
 331
 332       /* check if url contains username and/or password */
 333       host = strchr(buf, '@');
 334       if (host != NULL)
 335       {
 336          /* Contains username/password, skip it and the @ sign. */
 337          host++;
 338       }
 339       else
 340       {
 341          /* No username or password. */
 342          host = buf;
 343       }
 344
 345       /* Move after hostname before port number */
 346       if (*host == '[')
 347       {
 348          /* Numeric IPv6 address delimited by brackets */
 349          host++;
 350          port = strchr(host, ']');
 351
 352          if (port == NULL)
 353          {
 354             /* Missing closing bracket */
 355             freez(buf);
 356             return JB_ERR_PARSE;
 357          }
 358
 359          *port++ = '\0';
 360
 361          if (*port == '\0')
 362          {
 363             port = NULL;
 364          }
 365          else if (*port != ':')
 366          {
 367             /* Garbage after closing bracket */
 368             freez(buf);
 369             return JB_ERR_PARSE;
 370          }
 371       }
 372       else
 373       {
 374          /* Plain non-escaped hostname */
 375          port = strchr(host, ':');
 376       }
 377
 378       /* check if url contains port */
 379       if (port != NULL)
 380       {
 381          /* Contains port */
 382          char *endptr;
 383          long parsed_port;
 384          /* Terminate hostname and point to start of port string */
 385          *port++ = '\0';
 386          parsed_port = strtol(port, &endptr, 10);
 387          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 388          {
 389             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 390             freez(buf);
 391             return JB_ERR_PARSE;
 392          }
 393          http->port = (int)parsed_port;
 394       }
 395       else
 396       {
 397          /* No port specified. */
 398          http->port = (http->ssl ? 443 : 80);
 399       }
 400
 401       http->host = strdup_or_die(host);
 402
 403       freez(buf);
 404    }
 405
 406 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 407    return JB_ERR_OK;
 408 #else
 409    /* Split domain name so we can compare it against wildcards */
 410    return init_domain_components(http);
 411 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 412
 413 }
 414
 415
 416 /*********************************************************************
 417  *
 418  * Function    :  unknown_method
 419  *
 420  * Description :  Checks whether a method is unknown.
 421  *
 422  * Parameters  :
 423  *          1  :  method = points to a http method
 424  *
 425  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 426  *
 427  *********************************************************************/
 428 static int unknown_method(const char *method)
 429 {
 430    static const char * const known_http_methods[] = {
 431       /* Basic HTTP request type */
 432       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 433       /* webDAV extensions (RFC2518) */
 434       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 435       /*
 436        * Microsoft webDAV extension for Exchange 2000.  See:
 437        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 438        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 439        */
 440       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 441       /*
 442        * Another Microsoft webDAV extension for Exchange 2000.  See:
 443        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 444        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 445        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 446        */
 447       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 448       /*
 449        * Yet another WebDAV extension, this time for
 450        * Web Distributed Authoring and Versioning (RFC3253)
 451        */
 452       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 453       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 454       /*
 455        * The PATCH method is defined by RFC5789, the format of the
 456        * actual patch in the body depends on the application, but from
 457        * Privoxy's point of view it doesn't matter.
 458        */
 459       "PATCH",
 460    };
 461    int i;
 462
 463    for (i = 0; i < SZ(known_http_methods); i++)
 464    {
 465       if (0 == strcmpic(method, known_http_methods[i]))
 466       {
 467          return FALSE;
 468       }
 469    }
 470
 471    return TRUE;
 472
 473 }
 474
 475
 476 /*********************************************************************
 477  *
 478  * Function    :  normalize_http_version
 479  *
 480  * Description :  Take a supported HTTP version string and remove
 481  *                leading zeroes etc., reject unsupported versions.
 482  *
 483  *                This is an explicit RFC 2616 (3.1) MUST and
 484  *                RFC 7230 mandates that intermediaries send their
 485  *                own HTTP-version in forwarded messages.
 486  *
 487  * Parameters  :
 488  *          1  :  http_version = HTTP version string
 489  *
 490  * Returns     :  JB_ERR_OK on success
 491  *                JB_ERR_PARSE if the HTTP version is unsupported
 492  *
 493  *********************************************************************/
 494 static jb_err normalize_http_version(char *http_version)
 495 {
 496    unsigned int major_version;
 497    unsigned int minor_version;
 498
 499    if (2 != sscanf(http_version, "HTTP/%u.%u", &major_version, &minor_version))
 500    {
 501       log_error(LOG_LEVEL_ERROR, "Unsupported HTTP version: %s", http_version);
 502       return JB_ERR_PARSE;
 503    }
 504
 505    if (major_version != 1 || (minor_version != 0 && minor_version != 1))
 506    {
 507       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 508          "versions are 1.0 and 1.1. This rules out: %s", http_version);
 509       return JB_ERR_PARSE;
 510    }
 511
 512    assert(strlen(http_version) >= 8);
 513    snprintf(http_version, 9, "HTTP/%u.%u", major_version, minor_version);
 514
 515    return JB_ERR_OK;
 516
 517 }
 518
 519
 520 /*********************************************************************
 521  *
 522  * Function    :  parse_http_request
 523  *
 524  * Description :  Parse out the host and port from the URL.  Find the
 525  *                hostname & path, port (if ':'), and/or password (if '@')
 526  *
 527  * Parameters  :
 528  *          1  :  req = HTTP request line to break down
 529  *          2  :  http = pointer to the http structure to hold elements
 530  *
 531  * Returns     :  JB_ERR_OK on success
 532  *                JB_ERR_CGI_PARAMS on malformed command/URL
 533  *                                  or >100 domains deep.
 534  *
 535  *********************************************************************/
 536 jb_err parse_http_request(const char *req, struct http_request *http)
 537 {
 538    char *buf;
 539    char *v[3];
 540    int n;
 541    jb_err err;
 542
 543    memset(http, '\0', sizeof(*http));
 544
 545    buf = strdup_or_die(req);
 546
 547    n = ssplit(buf, " \r\n", v, SZ(v));
 548    if (n != 3)
 549    {
 550       freez(buf);
 551       return JB_ERR_PARSE;
 552    }
 553
 554    /*
 555     * Fail in case of unknown methods
 556     * which we might not handle correctly.
 557     *
 558     * XXX: There should be a config option
 559     * to forward requests with unknown methods
 560     * anyway. Most of them don't need special
 561     * steps.
 562     */
 563    if (unknown_method(v[0]))
 564    {
 565       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 566       freez(buf);
 567       return JB_ERR_PARSE;
 568    }
 569
 570    if (JB_ERR_OK != normalize_http_version(v[2]))
 571    {
 572       freez(buf);
 573       return JB_ERR_PARSE;
 574    }
 575
 576    http->ssl = !strcmpic(v[0], "CONNECT");
 577
 578    err = parse_http_url(v[1], http, !http->ssl);
 579    if (err)
 580    {
 581       freez(buf);
 582       return err;
 583    }
 584
 585    /*
 586     * Copy the details into the structure
 587     */
 588    http->cmd = strdup_or_die(req);
 589    http->gpc = strdup_or_die(v[0]);
 590    http->ver = strdup_or_die(v[2]);
 591    http->ocmd = strdup_or_die(http->cmd);
 592
 593    freez(buf);
 594
 595    return JB_ERR_OK;
 596
 597 }
 598
 599
 600 /*********************************************************************
 601  *
 602  * Function    :  compile_pattern
 603  *
 604  * Description :  Compiles a host, domain or TAG pattern.
 605  *
 606  * Parameters  :
 607  *          1  :  pattern = The pattern to compile.
 608  *          2  :  anchoring = How the regex should be modified
 609  *                            before compilation. Can be either
 610  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 611  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 612  *          3  :  url     = In case of failures, the spec member is
 613  *                          logged and the structure freed.
 614  *          4  :  regex   = Where the compiled regex should be stored.
 615  *
 616  * Returns     :  JB_ERR_OK - Success
 617  *                JB_ERR_PARSE - Cannot parse regex
 618  *
 619  *********************************************************************/
 620 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 621                               struct pattern_spec *url, regex_t **regex)
 622 {
 623    int errcode;
 624    const char *fmt = NULL;
 625    char *rebuf;
 626    size_t rebuf_size;
 627
 628    assert(pattern);
 629
 630    if (pattern[0] == '\0')
 631    {
 632       *regex = NULL;
 633       return JB_ERR_OK;
 634    }
 635
 636    switch (anchoring)
 637    {
 638       case NO_ANCHORING:
 639          fmt = "%s";
 640          break;
 641       case RIGHT_ANCHORED:
 642          fmt = "%s$";
 643          break;
 644       case RIGHT_ANCHORED_HOST:
 645          fmt = "%s\\.?$";
 646          break;
 647       case LEFT_ANCHORED:
 648          fmt = "^%s";
 649          break;
 650       default:
 651          log_error(LOG_LEVEL_FATAL,
 652             "Invalid anchoring in compile_pattern %d", anchoring);
 653    }
 654    rebuf_size = strlen(pattern) + strlen(fmt);
 655    rebuf = malloc_or_die(rebuf_size);
 656    *regex = zalloc_or_die(sizeof(**regex));
 657
 658    snprintf(rebuf, rebuf_size, fmt, pattern);
 659
 660    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 661
 662    if (errcode)
 663    {
 664       size_t errlen = regerror(errcode, *regex, rebuf, rebuf_size);
 665       if (errlen > (rebuf_size - (size_t)1))
 666       {
 667          errlen = rebuf_size - (size_t)1;
 668       }
 669       rebuf[errlen] = '\0';
 670       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 671          pattern, url->spec, rebuf);
 672       free_pattern_spec(url);
 673       freez(rebuf);
 674
 675       return JB_ERR_PARSE;
 676    }
 677    freez(rebuf);
 678
 679    return JB_ERR_OK;
 680
 681 }
 682
 683
 684 /*********************************************************************
 685  *
 686  * Function    :  compile_url_pattern
 687  *
 688  * Description :  Compiles the three parts of an URL pattern.
 689  *
 690  * Parameters  :
 691  *          1  :  url = Target pattern_spec to be filled in.
 692  *          2  :  buf = The url pattern to compile. Will be messed up.
 693  *
 694  * Returns     :  JB_ERR_OK - Success
 695  *                JB_ERR_MEMORY - Out of memory
 696  *                JB_ERR_PARSE - Cannot parse regex
 697  *
 698  *********************************************************************/
 699 static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
 700 {
 701    char *p;
 702
 703    p = strchr(buf, '/');
 704    if (NULL != p)
 705    {
 706       /*
 707        * Only compile the regex if it consists of more than
 708        * a single slash, otherwise it wouldn't affect the result.
 709        */
 710       if (p[1] != '\0')
 711       {
 712          /*
 713           * XXX: does it make sense to compile the slash at the beginning?
 714           */
 715          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
 716
 717          if (JB_ERR_OK != err)
 718          {
 719             return err;
 720          }
 721       }
 722       *p = '\0';
 723    }
 724
 725    /*
 726     * IPv6 numeric hostnames can contain colons, thus we need
 727     * to delimit the hostname before the real port separator.
 728     * As brackets are already used in the hostname pattern,
 729     * we use angle brackets ('<', '>') instead.
 730     */
 731    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 732    {
 733       *p++ = '\0';
 734       buf++;
 735
 736       if (*p == '\0')
 737       {
 738          /* IPv6 address without port number */
 739          p = NULL;
 740       }
 741       else if (*p != ':')
 742       {
 743          /* Garbage after address delimiter */
 744          return JB_ERR_PARSE;
 745       }
 746    }
 747    else
 748    {
 749       p = strchr(buf, ':');
 750    }
 751
 752    if (NULL != p)
 753    {
 754       *p++ = '\0';
 755       url->pattern.url_spec.port_list = strdup_or_die(p);
 756    }
 757    else
 758    {
 759       url->pattern.url_spec.port_list = NULL;
 760    }
 761
 762    if (buf[0] != '\0')
 763    {
 764       return compile_host_pattern(url, buf);
 765    }
 766
 767    return JB_ERR_OK;
 768
 769 }
 770
 771
 772 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 773 /*********************************************************************
 774  *
 775  * Function    :  compile_host_pattern
 776  *
 777  * Description :  Parses and compiles a host pattern.
 778  *
 779  * Parameters  :
 780  *          1  :  url = Target pattern_spec to be filled in.
 781  *          2  :  host_pattern = Host pattern to compile.
 782  *
 783  * Returns     :  JB_ERR_OK - Success
 784  *                JB_ERR_MEMORY - Out of memory
 785  *                JB_ERR_PARSE - Cannot parse regex
 786  *
 787  *********************************************************************/
 788 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 789 {
 790    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->pattern.url_spec.host_regex);
 791 }
 792
 793 #else
 794
 795 /*********************************************************************
 796  *
 797  * Function    :  compile_host_pattern
 798  *
 799  * Description :  Parses and "compiles" an old-school host pattern.
 800  *
 801  * Parameters  :
 802  *          1  :  url = Target pattern_spec to be filled in.
 803  *          2  :  host_pattern = Host pattern to parse.
 804  *
 805  * Returns     :  JB_ERR_OK - Success
 806  *                JB_ERR_PARSE - Cannot parse regex
 807  *
 808  *********************************************************************/
 809 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 810 {
 811    char *v[150];
 812    size_t size;
 813    char *p;
 814
 815    /*
 816     * Parse domain part
 817     */
 818    if (host_pattern[strlen(host_pattern) - 1] == '.')
 819    {
 820       url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
 821    }
 822    if (host_pattern[0] == '.')
 823    {
 824       url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
 825    }
 826
 827    /*
 828     * Split domain into components
 829     */
 830    url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
 831
 832    /*
 833     * Map to lower case
 834     */
 835    for (p = url->pattern.url_spec.dbuffer; *p ; p++)
 836    {
 837       *p = (char)privoxy_tolower(*p);
 838    }
 839
 840    /*
 841     * Split the domain name into components
 842     */
 843    url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
 844
 845    if (url->pattern.url_spec.dcount < 0)
 846    {
 847       free_pattern_spec(url);
 848       return JB_ERR_PARSE;
 849    }
 850    else if (url->pattern.url_spec.dcount != 0)
 851    {
 852       /*
 853        * Save a copy of the pointers in dvec
 854        */
 855       size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
 856
 857       url->pattern.url_spec.dvec = malloc_or_die(size);
 858
 859       memcpy(url->pattern.url_spec.dvec, v, size);
 860    }
 861    /*
 862     * else dcount == 0 in which case we needn't do anything,
 863     * since dvec will never be accessed and the pattern will
 864     * match all domains.
 865     */
 866    return JB_ERR_OK;
 867 }
 868
 869
 870 /*********************************************************************
 871  *
 872  * Function    :  simplematch
 873  *
 874  * Description :  String matching, with a (greedy) '*' wildcard that
 875  *                stands for zero or more arbitrary characters and
 876  *                character classes in [], which take both enumerations
 877  *                and ranges.
 878  *
 879  * Parameters  :
 880  *          1  :  pattern = pattern for matching
 881  *          2  :  text    = text to be matched
 882  *
 883  * Returns     :  0 if match, else nonzero
 884  *
 885  *********************************************************************/
 886 static int simplematch(const char *pattern, const char *text)
 887 {
 888    const unsigned char *pat = (const unsigned char *)pattern;
 889    const unsigned char *txt = (const unsigned char *)text;
 890    const unsigned char *fallback = pat;
 891    int wildcard = 0;
 892
 893    unsigned char lastchar = 'a';
 894    unsigned i;
 895    unsigned char charmap[32];
 896
 897    while (*txt)
 898    {
 899
 900       /* EOF pattern but !EOF text? */
 901       if (*pat == '\0')
 902       {
 903          if (wildcard)
 904          {
 905             pat = fallback;
 906          }
 907          else
 908          {
 909             return 1;
 910          }
 911       }
 912
 913       /* '*' in the pattern?  */
 914       if (*pat == '*')
 915       {
 916
 917          /* The pattern ends afterwards? Speed up the return. */
 918          if (*++pat == '\0')
 919          {
 920             return 0;
 921          }
 922
 923          /* Else, set wildcard mode and remember position after '*' */
 924          wildcard = 1;
 925          fallback = pat;
 926       }
 927
 928       /* Character range specification? */
 929       if (*pat == '[')
 930       {
 931          memset(charmap, '\0', sizeof(charmap));
 932
 933          while (*++pat != ']')
 934          {
 935             if (!*pat)
 936             {
 937                return 1;
 938             }
 939             else if (*pat == '-')
 940             {
 941                if ((*++pat == ']') || *pat == '\0')
 942                {
 943                   return(1);
 944                }
 945                for (i = lastchar; i <= *pat; i++)
 946                {
 947                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 948                }
 949             }
 950             else
 951             {
 952                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 953                lastchar = *pat;
 954             }
 955          }
 956       } /* -END- if Character range specification */
 957
 958
 959       /*
 960        * Char match, or char range match?
 961        */
 962       if ((*pat == *txt)
 963        || (*pat == '?')
 964        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 965       {
 966          /*
 967           * Success: Go ahead
 968           */
 969          pat++;
 970       }
 971       else if (!wildcard)
 972       {
 973          /*
 974           * No match && no wildcard: No luck
 975           */
 976          return 1;
 977       }
 978       else if (pat != fallback)
 979       {
 980          /*
 981           * Increment text pointer if in char range matching
 982           */
 983          if (*pat == ']')
 984          {
 985             txt++;
 986          }
 987          /*
 988           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 989           */
 990          pat = fallback;
 991          /*
 992           * Restart matching from current text pointer
 993           */
 994          continue;
 995       }
 996       txt++;
 997    }
 998
 999    /* Cut off extra '*'s */
1000    if (*pat == '*') pat++;
1001
1002    /* If this is the pattern's end, fine! */
1003    return(*pat);
1004
1005 }
1006
1007
1008 /*********************************************************************
1009  *
1010  * Function    :  simple_domaincmp
1011  *
1012  * Description :  Domain-wise Compare fqdn's.  The comparison is
1013  *                both left- and right-anchored.  The individual
1014  *                domain names are compared with simplematch().
1015  *                This is only used by domain_match.
1016  *
1017  * Parameters  :
1018  *          1  :  pv = array of patterns to compare
1019  *          2  :  fv = array of domain components to compare
1020  *          3  :  len = length of the arrays (both arrays are the
1021  *                      same length - if they weren't, it couldn't
1022  *                      possibly be a match).
1023  *
1024  * Returns     :  0 => domains are equivalent, else no match.
1025  *
1026  *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   int idx = 0;

   /*
    * Walk both vectors in lockstep. simplematch() yields a nonzero
    * value for a component that does not match, and the first such
    * component decides the overall result.
    */
   while (idx < len)
   {
      if (0 != simplematch(pv[idx], fv[idx]))
      {
         return 1;
      }
      idx++;
   }

   /* No component was rejected (trivially true when len <= 0). */
   return 0;

}
1042
1043
1044 /*********************************************************************
1045  *
1046  * Function    :  domain_match
1047  *
 * Description :  Domain-wise Compare fqdn's. Governed by the bitmap in
 *                p->pattern.url_spec.unanchored, the comparison is un-,
 *                left-, right-anchored, or both.
1051  *                The individual domain names are compared with
1052  *                simplematch().
1053  *
1054  * Parameters  :
1055  *          1  :  p = a domain that may contain a '*' as a wildcard.
1056  *          2  :  fqdn = domain name against which the patterns are compared.
1057  *
1058  * Returns     :  0 => domains are equivalent, else no match.
1059  *
1060  *********************************************************************/
1061 static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
1062 {
1063    char **pv, **fv;  /* vectors  */
1064    int    plen, flen;
1065    int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1066
1067    plen = p->pattern.url_spec.dcount;
1068    flen = fqdn->dcount;
1069
1070    if (flen < plen)
1071    {
1072       /* fqdn is too short to match this pattern */
1073       return 1;
1074    }
1075
1076    pv   = p->pattern.url_spec.dvec;
1077    fv   = fqdn->dvec;
1078
1079    if (unanchored == ANCHOR_LEFT)
1080    {
1081       /*
1082        * Right anchored.
1083        *
1084        * Convert this into a fully anchored pattern with
1085        * the fqdn and pattern the same length
1086        */
1087       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1088       return simple_domaincmp(pv, fv, plen);
1089    }
1090    else if (unanchored == 0)
1091    {
1092       /* Fully anchored, check length */
1093       if (flen != plen)
1094       {
1095          return 1;
1096       }
1097       return simple_domaincmp(pv, fv, plen);
1098    }
1099    else if (unanchored == ANCHOR_RIGHT)
1100    {
1101       /* Left anchored, ignore all extra in fqdn */
1102       return simple_domaincmp(pv, fv, plen);
1103    }
1104    else
1105    {
1106       /* Unanchored */
1107       int n;
1108       int maxn = flen - plen;
1109       for (n = 0; n <= maxn; n++)
1110       {
1111          if (!simple_domaincmp(pv, fv, plen))
1112          {
1113             return 0;
1114          }
1115          /*
1116           * Doesn't match from start of fqdn
1117           * Try skipping first part of fqdn
1118           */
1119          fv++;
1120       }
1121       return 1;
1122    }
1123
1124 }
1125 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1126
1127
1128 /*********************************************************************
1129  *
1130  * Function    :  create_pattern_spec
1131  *
1132  * Description :  Creates a "pattern_spec" structure from a string.
1133  *                When finished, free with free_pattern_spec().
1134  *
1135  * Parameters  :
1136  *          1  :  pattern = Target pattern_spec to be filled in.
1137  *                          Will be zeroed before use.
1138  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1139  *                      contents of this buffer are destroyed by this
1140  *                      function.  If this function succeeds, the
1141  *                      buffer is copied to pattern->spec.  If this
1142  *                      function fails, the contents of the buffer
1143  *                      are lost forever.
1144  *
1145  * Returns     :  JB_ERR_OK - Success
1146  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1147  *                               written to system log)
1148  *
1149  *********************************************************************/
1150 jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
1151 {
1152    static const struct
1153    {
1154       /** The tag pattern prefix to match */
1155       const char *prefix;
1156
1157       /** The length of the prefix to match */
1158       const size_t prefix_length;
1159
1160       /** The pattern flag */
1161       const unsigned flag;
1162    } tag_pattern[] = {
1163       { "TAG:",              4, PATTERN_SPEC_TAG_PATTERN},
1164  #ifdef FEATURE_CLIENT_TAGS
1165       { "CLIENT-TAG:",      11, PATTERN_SPEC_CLIENT_TAG_PATTERN},
1166  #endif
1167       { "NO-REQUEST-TAG:",  15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
1168       { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
1169    };
1170    int i;
1171
1172    assert(pattern);
1173    assert(buf);
1174
1175    memset(pattern, '\0', sizeof(*pattern));
1176
1177    /* Remember the original specification for the CGI pages. */
1178    pattern->spec = strdup_or_die(buf);
1179
1180    /* Check if it's a tag pattern */
1181    for (i = 0; i < SZ(tag_pattern); i++)
1182    {
1183       if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
1184       {
1185          /* The regex starts after the prefix */
1186          const char *tag_regex = buf + tag_pattern[i].prefix_length;
1187
1188          pattern->flags |= tag_pattern[i].flag;
1189
1190          return compile_pattern(tag_regex, NO_ANCHORING, pattern,
1191             &pattern->pattern.tag_regex);
1192       }
1193    }
1194
1195    /* If it isn't a tag pattern it must be an URL pattern. */
1196    pattern->flags |= PATTERN_SPEC_URL_PATTERN;
1197
1198    return compile_url_pattern(pattern, buf);
1199
1200 }
1201
1202
1203 /*********************************************************************
1204  *
1205  * Function    :  free_pattern_spec
1206  *
 * Description :  Called from the "unloaders".  Frees the elements of
 *                the pattern structure.
1209  *
1210  * Parameters  :
1211  *          1  :  pattern = pointer to a pattern_spec structure.
1212  *
1213  * Returns     :  N/A
1214  *
1215  *********************************************************************/
1216 void free_pattern_spec(struct pattern_spec *pattern)
1217 {
1218    if (pattern == NULL) return;
1219
1220    freez(pattern->spec);
1221 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1222    if (pattern->pattern.url_spec.host_regex)
1223    {
1224       regfree(pattern->pattern.url_spec.host_regex);
1225       freez(pattern->pattern.url_spec.host_regex);
1226    }
1227 #else
1228    freez(pattern->pattern.url_spec.dbuffer);
1229    freez(pattern->pattern.url_spec.dvec);
1230    pattern->pattern.url_spec.dcount = 0;
1231 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1232    freez(pattern->pattern.url_spec.port_list);
1233    if (pattern->pattern.url_spec.preg)
1234    {
1235       regfree(pattern->pattern.url_spec.preg);
1236       freez(pattern->pattern.url_spec.preg);
1237    }
1238    if (pattern->pattern.tag_regex)
1239    {
1240       regfree(pattern->pattern.tag_regex);
1241       freez(pattern->pattern.tag_regex);
1242    }
1243 }
1244
1245
1246 /*********************************************************************
1247  *
1248  * Function    :  port_matches
1249  *
1250  * Description :  Compares a port against a port list.
1251  *
1252  * Parameters  :
1253  *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with.
1255  *
1256  * Returns     :  TRUE for yes, FALSE otherwise.
1257  *
1258  *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   /* Without a port list there is no restriction: every port matches. */
   if (NULL == port_list)
   {
      return 1;
   }

   /* Otherwise the list decides; normalise the answer to 0 or 1. */
   return (0 != match_portlist(port_list, port));
}
1263
1264
1265 /*********************************************************************
1266  *
1267  * Function    :  host_matches
1268  *
1269  * Description :  Compares a host against a host pattern.
1270  *
1271  * Parameters  :
 *          1  :  http = The request whose host is to be matched
1273  *          2  :  pattern = The URL pattern
1274  *
1275  * Returns     :  TRUE for yes, FALSE otherwise.
1276  *
1277  *********************************************************************/
1278 static int host_matches(const struct http_request *http,
1279                         const struct pattern_spec *pattern)
1280 {
1281    assert(http->host != NULL);
1282 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1283    return ((NULL == pattern->pattern.url_spec.host_regex)
1284       || (0 == regexec(pattern->pattern.url_spec.host_regex, http->host, 0, NULL, 0)));
1285 #else
1286    return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
1287 #endif
1288 }
1289
1290
1291 /*********************************************************************
1292  *
1293  * Function    :  path_matches
1294  *
1295  * Description :  Compares a path against a path pattern.
1296  *
1297  * Parameters  :
1298  *          1  :  path = The path to match
1299  *          2  :  pattern = The URL pattern
1300  *
1301  * Returns     :  TRUE for yes, FALSE otherwise.
1302  *
1303  *********************************************************************/
1304 static int path_matches(const char *path, const struct pattern_spec *pattern)
1305 {
1306    return ((NULL == pattern->pattern.url_spec.preg)
1307       || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
1308 }
1309
1310
1311 /*********************************************************************
1312  *
1313  * Function    :  url_match
1314  *
1315  * Description :  Compare a URL against a URL pattern.
1316  *
1317  * Parameters  :
1318  *          1  :  pattern = a URL pattern
 *          2  :  http = request whose URL (port, host, path) is matched
1320  *
1321  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1322  *
1323  *********************************************************************/
1324 int url_match(const struct pattern_spec *pattern,
1325               const struct http_request *http)
1326 {
1327    if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
1328    {
1329       /* It's not an URL pattern and thus shouldn't be matched against URLs */
1330       return 0;
1331    }
1332
1333    return (port_matches(http->port, pattern->pattern.url_spec.port_list)
1334       && host_matches(http, pattern) && path_matches(http->path, pattern));
1335
1336 }
1337
1338
1339 /*********************************************************************
1340  *
1341  * Function    :  match_portlist
1342  *
1343  * Description :  Check if a given number is covered by a comma
1344  *                separated list of numbers and ranges (a,b-c,d,..)
1345  *
1346  * Parameters  :
1347  *          1  :  portlist = String with list
1348  *          2  :  port = port to check
1349  *
1350  * Returns     :  0 => no match
1351  *                1 => match
1352  *
1353  *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   /*
    * Checks whether port is covered by portlist, a comma separated
    * list of single ports and ranges ("a,b-c,d-,-e").
    *
    * The list is scanned in place. No copy is made, so the function
    * neither allocates memory nor modifies its input. portlist must
    * not be NULL.
    *
    * Item syntax (unchanged):
    *   "n"    matches port n
    *   "a-b"  matches a <= port <= b
    *   "a-"   upper bound omitted (or 0): matches a <= port <= 65535
    *   "-b"   lower bound omitted: treated as 0
    * Numbers are read like atoi() would read them: leading whitespace
    * is skipped and parsing stops at the first non-digit.
    *
    * Returns 1 on match, 0 otherwise.
    */
   const char *item = portlist;

   while (NULL != item)
   {
      /* The current item ends at the next comma or at the end of the list. */
      const char *comma = strchr(item, (int) ',');
      const size_t item_length = (NULL != comma) ? (size_t)(comma - item) : strlen(item);
      /* Only a dash inside the current item makes it a range. */
      const char *dash = memchr(item, (int) '-', item_length);

      if (NULL == dash)
      {
         /*
          * Single port. strtol() cannot run into the next item
          * because ',' is neither whitespace, sign nor digit.
          */
         if (port == (int)strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         const char *lower = item;
         int min = 0;
         int max;

         /*
          * The dash is the range separator, not a sign. If nothing but
          * whitespace precedes it, the lower bound is empty and stays 0;
          * strtol() must not see the dash in that case.
          */
         while ((lower < dash) && isspace((unsigned char)*lower))
         {
            lower++;
         }
         if (lower < dash)
         {
            min = (int)strtol(lower, NULL, 10);
         }

         /* An omitted (or zero) upper bound means "up to the highest port". */
         max = (int)strtol(dash + 1, NULL, 10);

         if ((port >= min) && (port <= ((0 != max) ? max : 65535)))
         {
            return 1;
         }
      }

      /* Jump to the item after the comma, if there is one. */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1417
1418
1419 /*********************************************************************
1420  *
1421  * Function    :  parse_forwarder_address
1422  *
1423  * Description :  Parse out the username, password, host and port from
1424  *                a forwarder address.
1425  *
1426  * Parameters  :
1427  *          1  :  address = The forwarder address to parse.
1428  *          2  :  hostname = Used to return the hostname. NULL on error.
1429  *          3  :  port = Used to return the port. Untouched if no port
1430  *                       is specified.
1431  *          4  :  username = Used to return the username if any.
1432  *          5  :  password = Used to return the password if any.
1433  *
1434  * Returns     :  JB_ERR_OK on success
1435  *                JB_ERR_MEMORY on out of memory
1436  *                JB_ERR_PARSE on malformed address.
1437  *
1438  *********************************************************************/
1439 jb_err parse_forwarder_address(char *address, char **hostname, int *port,
1440                                char **username, char **password)
1441 {
1442    char *p;
1443    char *tmp;
1444
1445    tmp = *hostname = strdup_or_die(address);
1446
1447    /* Parse username and password */
1448    if (username && password && (NULL != (p = strchr(*hostname, '@'))))
1449    {
1450       *p++ = '\0';
1451       *username = strdup_or_die(*hostname);
1452       *hostname = strdup_or_die(p);
1453
1454       if (NULL != (p = strchr(*username, ':')))
1455       {
1456          *p++ = '\0';
1457          *password = strdup_or_die(p);
1458       }
1459       freez(tmp);
1460    }
1461
1462    /* Parse hostname and port */
1463    p = *hostname;
1464    if ((*p == '[') && (NULL == strchr(p, ']')))
1465    {
1466       /* XXX: Should do some more validity checks here. */
1467       return JB_ERR_PARSE;
1468    }
1469
1470    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1471    {
1472       *p++ = '\0';
1473       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1474       if (*p == ':')
1475       {
1476          *port = (int)strtol(++p, NULL, 0);
1477       }
1478    }
1479    else if (NULL != (p = strchr(*hostname, ':')))
1480    {
1481       *p++ = '\0';
1482       *port = (int)strtol(p, NULL, 0);
1483    }
1484
1485    return JB_ERR_OK;
1486
1487 }
1488
1489
1490 /*
1491   Local Variables:
1492   tab-width: 3
1493   end:
1494 */