urlmatch.c

   /* Version-control ($Id$ keyword) identification string for this source file. */
   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.85 2014/07/25 11:56:26 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2014
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
  /*
   * Identification string built from URLMATCH_H_VERSION, which is
   * presumably provided by urlmatch.h (not visible here - confirm).
   */
  59 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  60
  /*
   * Tells compile_pattern() how to decorate a pattern before it is
   * handed to regcomp().
   */
  enum regex_anchoring
  {
     NO_ANCHORING        = 0,   /* compile the pattern unchanged       */
     LEFT_ANCHORED       = 1,   /* prefix the pattern with '^'         */
     RIGHT_ANCHORED      = 2,   /* append '$' to the pattern           */
     RIGHT_ANCHORED_HOST = 3    /* append an optional '.' and then '$' */
  };
  /*
   * Forward declaration: compile_url_pattern() calls compile_host_pattern()
   * before either of its two conditionally compiled definitions appears.
   */
  68 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern);
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_PARSE on malformed command/URL
 119  *                             or >100 domains deep.
 120  *
 121  *********************************************************************/
 122 jb_err init_domain_components(struct http_request *http)
 123 {
 124    char *vec[BUFFER_SIZE];
 125    size_t size;
 126    char *p;
 127
 128    http->dbuffer = strdup_or_die(http->host);
 129
 130    /* map to lower case */
 131    for (p = http->dbuffer; *p ; p++)
 132    {
 133       *p = (char)privoxy_tolower(*p);
 134    }
 135
 136    /* split the domain name into components */
 137    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 138
 139    if (http->dcount <= 0)
 140    {
 141       /*
 142        * Error: More than SZ(vec) components in domain
 143        *    or: no components in domain
 144        */
 145       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 146       return JB_ERR_PARSE;
 147    }
 148
 149    /* save a copy of the pointers in dvec */
 150    size = (size_t)http->dcount * sizeof(*http->dvec);
 151
 152    http->dvec = malloc_or_die(size);
 153
 154    memcpy(http->dvec, vec, size);
 155
 156    return JB_ERR_OK;
 157 }
 158 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 159
 160
 161 /*********************************************************************
 162  *
 163  * Function    :  url_requires_percent_encoding
 164  *
 165  * Description :  Checks if an URL contains invalid characters
 166  *                according to RFC 3986 that should be percent-encoded.
 167  *                Does not verify whether or not the passed string
 168  *                actually is a valid URL.
 169  *
 170  * Parameters  :
 171  *          1  :  url = URL to check
 172  *
 173  * Returns     :  True in case of valid URLs, false otherwise
 174  *
 175  *********************************************************************/
 176 int url_requires_percent_encoding(const char *url)
 177 {
 178    static const char allowed_characters[128] = {
 179       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 180       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 181       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 182       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 183       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 184       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 185       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 186       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 187       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 188       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 189       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 190       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 191       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 192    };
 193
 194    while (*url != '\0')
 195    {
 196       const unsigned int i = (unsigned char)*url++;
 197       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 198       {
 199          return TRUE;
 200       }
 201    }
 202
 203    return FALSE;
 204
 205 }
 206
 207
 208 /*********************************************************************
 209  *
 210  * Function    :  parse_http_url
 211  *
 212  * Description :  Parse out the host and port from the URL.  Find the
 213  *                hostname & path, port (if ':'), and/or password (if '@')
 214  *
 215  * Parameters  :
 216  *          1  :  url = URL (or is it URI?) to break down
 217  *          2  :  http = pointer to the http structure to hold elements.
 218  *                       Must be initialized with valid values (like NULLs).
 219  *          3  :  require_protocol = Whether or not URLs without
 220  *                                   protocol are acceptable.
 221  *
 222  * Returns     :  JB_ERR_OK on success
 223  *                JB_ERR_PARSE on malformed command/URL
 224  *                             or >100 domains deep.
 225  *
 226  *********************************************************************/
 227 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 228 {
 229    int host_available = 1; /* A proxy can dream. */
 230
 231    /*
 232     * Save our initial URL
 233     */
 234    http->url = strdup_or_die(url);
 235
 236    /*
 237     * Check for * URI. If found, we're done.
 238     */
 239    if (*http->url == '*')
 240    {
 241       http->path = strdup_or_die("*");
 242       http->hostport = strdup_or_die("");
 243       if (http->url[1] != '\0')
 244       {
 245          return JB_ERR_PARSE;
 246       }
 247       return JB_ERR_OK;
 248    }
 249
 250
 251    /*
 252     * Split URL into protocol,hostport,path.
 253     */
 254    {
 255       char *buf;
 256       char *url_noproto;
 257       char *url_path;
 258
 259       buf = strdup_or_die(url);
 260
 261       /* Find the start of the URL in our scratch space */
 262       url_noproto = buf;
 263       if (strncmpic(url_noproto, "http://",  7) == 0)
 264       {
 265          url_noproto += 7;
 266       }
 267       else if (strncmpic(url_noproto, "https://", 8) == 0)
 268       {
 269          /*
 270           * Should only happen when called from cgi_show_url_info().
 271           */
 272          url_noproto += 8;
 273          http->ssl = 1;
 274       }
 275       else if (*url_noproto == '/')
 276       {
 277         /*
 278          * Short request line without protocol and host.
 279          * Most likely because the client's request
 280          * was intercepted and redirected into Privoxy.
 281          */
 282          http->host = NULL;
 283          host_available = 0;
 284       }
 285       else if (require_protocol)
 286       {
 287          freez(buf);
 288          return JB_ERR_PARSE;
 289       }
 290
 291       url_path = strchr(url_noproto, '/');
 292       if (url_path != NULL)
 293       {
 294          /*
 295           * Got a path.
 296           *
 297           * NOTE: The following line ignores the path for HTTPS URLS.
 298           * This means that you get consistent behaviour if you type a
 299           * https URL in and it's parsed by the function.  (When the
 300           * URL is actually retrieved, SSL hides the path part).
 301           */
 302          http->path = strdup_or_die(http->ssl ? "/" : url_path);
 303          *url_path = '\0';
 304          http->hostport = strdup_or_die(url_noproto);
 305       }
 306       else
 307       {
 308          /*
 309           * Repair broken HTTP requests that don't contain a path,
 310           * or CONNECT requests
 311           */
 312          http->path = strdup_or_die("/");
 313          http->hostport = strdup_or_die(url_noproto);
 314       }
 315
 316       freez(buf);
 317    }
 318
 319    if (!host_available)
 320    {
 321       /* Without host, there is nothing left to do here */
 322       return JB_ERR_OK;
 323    }
 324
 325    /*
 326     * Split hostport into user/password (ignored), host, port.
 327     */
 328    {
 329       char *buf;
 330       char *host;
 331       char *port;
 332
 333       buf = strdup_or_die(http->hostport);
 334
 335       /* check if url contains username and/or password */
 336       host = strchr(buf, '@');
 337       if (host != NULL)
 338       {
 339          /* Contains username/password, skip it and the @ sign. */
 340          host++;
 341       }
 342       else
 343       {
 344          /* No username or password. */
 345          host = buf;
 346       }
 347
 348       /* Move after hostname before port number */
 349       if (*host == '[')
 350       {
 351          /* Numeric IPv6 address delimited by brackets */
 352          host++;
 353          port = strchr(host, ']');
 354
 355          if (port == NULL)
 356          {
 357             /* Missing closing bracket */
 358             freez(buf);
 359             return JB_ERR_PARSE;
 360          }
 361
 362          *port++ = '\0';
 363
 364          if (*port == '\0')
 365          {
 366             port = NULL;
 367          }
 368          else if (*port != ':')
 369          {
 370             /* Garbage after closing bracket */
 371             freez(buf);
 372             return JB_ERR_PARSE;
 373          }
 374       }
 375       else
 376       {
 377          /* Plain non-escaped hostname */
 378          port = strchr(host, ':');
 379       }
 380
 381       /* check if url contains port */
 382       if (port != NULL)
 383       {
 384          /* Contains port */
 385          char *endptr;
 386          long parsed_port;
 387          /* Terminate hostname and point to start of port string */
 388          *port++ = '\0';
 389          parsed_port = strtol(port, &endptr, 10);
 390          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 391          {
 392             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 393             freez(buf);
 394             return JB_ERR_PARSE;
 395          }
 396          http->port = (int)parsed_port;
 397       }
 398       else
 399       {
 400          /* No port specified. */
 401          http->port = (http->ssl ? 443 : 80);
 402       }
 403
 404       http->host = strdup_or_die(host);
 405
 406       freez(buf);
 407    }
 408
 409 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 410    return JB_ERR_OK;
 411 #else
 412    /* Split domain name so we can compare it against wildcards */
 413    return init_domain_components(http);
 414 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 415
 416 }
 417
 418
 419 /*********************************************************************
 420  *
 421  * Function    :  unknown_method
 422  *
 423  * Description :  Checks whether a method is unknown.
 424  *
 425  * Parameters  :
 426  *          1  :  method = points to a http method
 427  *
 428  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 429  *
 430  *********************************************************************/
 431 static int unknown_method(const char *method)
 432 {
 433    static const char * const known_http_methods[] = {
 434       /* Basic HTTP request type */
 435       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 436       /* webDAV extensions (RFC2518) */
 437       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 438       /*
 439        * Microsoft webDAV extension for Exchange 2000.  See:
 440        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 441        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 442        */
 443       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 444       /*
 445        * Another Microsoft webDAV extension for Exchange 2000.  See:
 446        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 447        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 448        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 449        */
 450       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 451       /*
 452        * Yet another WebDAV extension, this time for
 453        * Web Distributed Authoring and Versioning (RFC3253)
 454        */
 455       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 456       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 457       /*
 458        * The PATCH method is defined by RFC5789, the format of the
 459        * actual patch in the body depends on the application, but from
 460        * Privoxy's point of view it doesn't matter.
 461        */
 462       "PATCH",
 463    };
 464    int i;
 465
 466    for (i = 0; i < SZ(known_http_methods); i++)
 467    {
 468       if (0 == strcmpic(method, known_http_methods[i]))
 469       {
 470          return FALSE;
 471       }
 472    }
 473
 474    return TRUE;
 475
 476 }
 477
 478
 479 /*********************************************************************
 480  *
 481  * Function    :  normalize_http_version
 482  *
 483  * Description :  Take a supported HTTP version string and remove
 484  *                leading zeroes etc., reject unsupported versions.
 485  *
 486  *                This is an explicit RFC 2616 (3.1) MUST and
 487  *                RFC 7230 mandates that intermediaries send their
 488  *                own HTTP-version in forwarded messages.
 489  *
 490  * Parameters  :
 491  *          1  :  http_version = HTTP version string
 492  *
 493  * Returns     :  JB_ERR_OK on success
 494  *                JB_ERR_PARSE if the HTTP version is unsupported
 495  *
 496  *********************************************************************/
 497 jb_err static normalize_http_version(char *http_version)
 498 {
 499    unsigned int major_version;
 500    unsigned int minor_version;
 501
 502    if (2 != sscanf(http_version, "HTTP/%u.%u", &major_version, &minor_version))
 503    {
 504       log_error(LOG_LEVEL_ERROR, "Unsupported HTTP version: %s", http_version);
 505       return JB_ERR_PARSE;
 506    }
 507
 508    if (major_version != 1 || (minor_version != 0 && minor_version != 1))
 509    {
 510       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 511          "versions are 1.0 and 1.1. This rules out: %s", http_version);
 512       return JB_ERR_PARSE;
 513    }
 514
 515    assert(strlen(http_version) >= 8);
 516    snprintf(http_version, 9, "HTTP/%u.%u", major_version, minor_version);
 517
 518    return JB_ERR_OK;
 519
 520 }
 521
 522
 523 /*********************************************************************
 524  *
 525  * Function    :  parse_http_request
 526  *
 527  * Description :  Parse out the host and port from the URL.  Find the
 528  *                hostname & path, port (if ':'), and/or password (if '@')
 529  *
 530  * Parameters  :
 531  *          1  :  req = HTTP request line to break down
 532  *          2  :  http = pointer to the http structure to hold elements
 533  *
 534  * Returns     :  JB_ERR_OK on success
 535  *                JB_ERR_CGI_PARAMS on malformed command/URL
 536  *                                  or >100 domains deep.
 537  *
 538  *********************************************************************/
 539 jb_err parse_http_request(const char *req, struct http_request *http)
 540 {
 541    char *buf;
 542    char *v[3];
 543    int n;
 544    jb_err err;
 545
 546    memset(http, '\0', sizeof(*http));
 547
 548    buf = strdup_or_die(req);
 549
 550    n = ssplit(buf, " \r\n", v, SZ(v));
 551    if (n != 3)
 552    {
 553       freez(buf);
 554       return JB_ERR_PARSE;
 555    }
 556
 557    /*
 558     * Fail in case of unknown methods
 559     * which we might not handle correctly.
 560     *
 561     * XXX: There should be a config option
 562     * to forward requests with unknown methods
 563     * anyway. Most of them don't need special
 564     * steps.
 565     */
 566    if (unknown_method(v[0]))
 567    {
 568       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 569       freez(buf);
 570       return JB_ERR_PARSE;
 571    }
 572
 573    if (JB_ERR_OK != normalize_http_version(v[2]))
 574    {
 575       freez(buf);
 576       return JB_ERR_PARSE;
 577    }
 578
 579    http->ssl = !strcmpic(v[0], "CONNECT");
 580
 581    err = parse_http_url(v[1], http, !http->ssl);
 582    if (err)
 583    {
 584       freez(buf);
 585       return err;
 586    }
 587
 588    /*
 589     * Copy the details into the structure
 590     */
 591    http->cmd = strdup_or_die(req);
 592    http->gpc = strdup_or_die(v[0]);
 593    http->ver = strdup_or_die(v[2]);
 594    http->ocmd = strdup_or_die(http->cmd);
 595
 596    freez(buf);
 597
 598    return JB_ERR_OK;
 599
 600 }
 601
 602
 603 /*********************************************************************
 604  *
 605  * Function    :  compile_pattern
 606  *
 607  * Description :  Compiles a host, domain or TAG pattern.
 608  *
 609  * Parameters  :
 610  *          1  :  pattern = The pattern to compile.
 611  *          2  :  anchoring = How the regex should be modified
 612  *                            before compilation. Can be either
 613  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 614  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 615  *          3  :  url     = In case of failures, the spec member is
 616  *                          logged and the structure freed.
 617  *          4  :  regex   = Where the compiled regex should be stored.
 618  *
 619  * Returns     :  JB_ERR_OK - Success
 620  *                JB_ERR_MEMORY - Out of memory
 621  *                JB_ERR_PARSE - Cannot parse regex
 622  *
 623  *********************************************************************/
 624 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 625                               struct pattern_spec *url, regex_t **regex)
 626 {
 627    int errcode;
 628    char rebuf[BUFFER_SIZE];
 629    const char *fmt = NULL;
 630
 631    assert(pattern);
 632    assert(strlen(pattern) < sizeof(rebuf) - 2);
 633
 634    if (pattern[0] == '\0')
 635    {
 636       *regex = NULL;
 637       return JB_ERR_OK;
 638    }
 639
 640    switch (anchoring)
 641    {
 642       case NO_ANCHORING:
 643          fmt = "%s";
 644          break;
 645       case RIGHT_ANCHORED:
 646          fmt = "%s$";
 647          break;
 648       case RIGHT_ANCHORED_HOST:
 649          fmt = "%s\\.?$";
 650          break;
 651       case LEFT_ANCHORED:
 652          fmt = "^%s";
 653          break;
 654       default:
 655          log_error(LOG_LEVEL_FATAL,
 656             "Invalid anchoring in compile_pattern %d", anchoring);
 657    }
 658
 659    *regex = zalloc(sizeof(**regex));
 660    if (NULL == *regex)
 661    {
 662       free_pattern_spec(url);
 663       return JB_ERR_MEMORY;
 664    }
 665
 666    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 667
 668    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 669
 670    if (errcode)
 671    {
 672       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 673       if (errlen > (sizeof(rebuf) - (size_t)1))
 674       {
 675          errlen = sizeof(rebuf) - (size_t)1;
 676       }
 677       rebuf[errlen] = '\0';
 678       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 679          pattern, url->spec, rebuf);
 680       free_pattern_spec(url);
 681
 682       return JB_ERR_PARSE;
 683    }
 684
 685    return JB_ERR_OK;
 686
 687 }
 688
 689
 690 /*********************************************************************
 691  *
 692  * Function    :  compile_url_pattern
 693  *
 694  * Description :  Compiles the three parts of an URL pattern.
 695  *
 696  * Parameters  :
 697  *          1  :  url = Target pattern_spec to be filled in.
 698  *          2  :  buf = The url pattern to compile. Will be messed up.
 699  *
 700  * Returns     :  JB_ERR_OK - Success
 701  *                JB_ERR_MEMORY - Out of memory
 702  *                JB_ERR_PARSE - Cannot parse regex
 703  *
 704  *********************************************************************/
 705 static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
 706 {
 707    char *p;
 708
 709    p = strchr(buf, '/');
 710    if (NULL != p)
 711    {
 712       /*
 713        * Only compile the regex if it consists of more than
 714        * a single slash, otherwise it wouldn't affect the result.
 715        */
 716       if (p[1] != '\0')
 717       {
 718          /*
 719           * XXX: does it make sense to compile the slash at the beginning?
 720           */
 721          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
 722
 723          if (JB_ERR_OK != err)
 724          {
 725             return err;
 726          }
 727       }
 728       *p = '\0';
 729    }
 730
 731    /*
 732     * IPv6 numeric hostnames can contain colons, thus we need
 733     * to delimit the hostname before the real port separator.
 734     * As brackets are already used in the hostname pattern,
 735     * we use angle brackets ('<', '>') instead.
 736     */
 737    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 738    {
 739       *p++ = '\0';
 740       buf++;
 741
 742       if (*p == '\0')
 743       {
 744          /* IPv6 address without port number */
 745          p = NULL;
 746       }
 747       else if (*p != ':')
 748       {
 749          /* Garbage after address delimiter */
 750          return JB_ERR_PARSE;
 751       }
 752    }
 753    else
 754    {
 755       p = strchr(buf, ':');
 756    }
 757
 758    if (NULL != p)
 759    {
 760       *p++ = '\0';
 761       url->pattern.url_spec.port_list = strdup_or_die(p);
 762    }
 763    else
 764    {
 765       url->pattern.url_spec.port_list = NULL;
 766    }
 767
 768    if (buf[0] != '\0')
 769    {
 770       return compile_host_pattern(url, buf);
 771    }
 772
 773    return JB_ERR_OK;
 774
 775 }
 776
 777
 778 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 779 /*********************************************************************
 780  *
 781  * Function    :  compile_host_pattern
 782  *
 783  * Description :  Parses and compiles a host pattern.
 784  *
 785  * Parameters  :
 786  *          1  :  url = Target pattern_spec to be filled in.
 787  *          2  :  host_pattern = Host pattern to compile.
 788  *
 789  * Returns     :  JB_ERR_OK - Success
 790  *                JB_ERR_MEMORY - Out of memory
 791  *                JB_ERR_PARSE - Cannot parse regex
 792  *
 793  *********************************************************************/
 794 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 795 {
 796    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->pattern.url_spec.host_regex);
 797 }
 798
 799 #else
 800
 801 /*********************************************************************
 802  *
 803  * Function    :  compile_host_pattern
 804  *
 805  * Description :  Parses and "compiles" an old-school host pattern.
 806  *
 807  * Parameters  :
 808  *          1  :  url = Target pattern_spec to be filled in.
 809  *          2  :  host_pattern = Host pattern to parse.
 810  *
 811  * Returns     :  JB_ERR_OK - Success
 812  *                JB_ERR_PARSE - Cannot parse regex
 813  *
 814  *********************************************************************/
 815 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 816 {
 817    char *v[150];
 818    size_t size;
 819    char *p;
 820
 821    /*
 822     * Parse domain part
 823     */
 824    if (host_pattern[strlen(host_pattern) - 1] == '.')
 825    {
 826       url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
 827    }
 828    if (host_pattern[0] == '.')
 829    {
 830       url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
 831    }
 832
 833    /*
 834     * Split domain into components
 835     */
 836    url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
 837
 838    /*
 839     * Map to lower case
 840     */
 841    for (p = url->pattern.url_spec.dbuffer; *p ; p++)
 842    {
 843       *p = (char)privoxy_tolower(*p);
 844    }
 845
 846    /*
 847     * Split the domain name into components
 848     */
 849    url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
 850
 851    if (url->pattern.url_spec.dcount < 0)
 852    {
 853       free_pattern_spec(url);
 854       return JB_ERR_PARSE;
 855    }
 856    else if (url->pattern.url_spec.dcount != 0)
 857    {
 858       /*
 859        * Save a copy of the pointers in dvec
 860        */
 861       size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
 862
 863       url->pattern.url_spec.dvec = malloc_or_die(size);
 864
 865       memcpy(url->pattern.url_spec.dvec, v, size);
 866    }
 867    /*
 868     * else dcount == 0 in which case we needn't do anything,
 869     * since dvec will never be accessed and the pattern will
 870     * match all domains.
 871     */
 872    return JB_ERR_OK;
 873 }
 874
 875
 /*********************************************************************
  *
  * Function    :  simplematch
  *
  * Description :  String matching, with a (greedy) '*' wildcard that
  *                stands for zero or more arbitrary characters, a '?'
  *                that matches any single character and character
  *                classes in [], which take both enumerations and
  *                ranges.
  *
  *                The character map starts out empty, so a ']' that
  *                shows up in the pattern before any character class
  *                was parsed only matches a literal ']' in the text
  *                instead of depending on uninitialized memory.
  *
  * Parameters  :
  *          1  :  pattern = pattern for matching
  *          2  :  text    = text to be matched
  *
  * Returns     :  0 if match, else nonzero
  *
  *********************************************************************/
 static int simplematch(const char *pattern, const char *text)
 {
    const unsigned char *pat = (const unsigned char *)pattern;
    const unsigned char *txt = (const unsigned char *)text;
    const unsigned char *fallback = pat;
    int wildcard = 0;

    unsigned char lastchar = 'a';
    unsigned i;
    /* One bit per possible character value, all cleared initially. */
    unsigned char charmap[32] = { 0 };

    while (*txt)
    {

       /* EOF pattern but !EOF text? */
       if (*pat == '\0')
       {
          if (wildcard)
          {
             pat = fallback;
          }
          else
          {
             return 1;
          }
       }

       /* '*' in the pattern?  */
       if (*pat == '*')
       {

          /* The pattern ends afterwards? Speed up the return. */
          if (*++pat == '\0')
          {
             return 0;
          }

          /* Else, set wildcard mode and remember position after '*' */
          wildcard = 1;
          fallback = pat;
       }

       /* Character range specification? */
       if (*pat == '[')
       {
          /* Rebuild the map from scratch for this class. */
          memset(charmap, '\0', sizeof(charmap));

          while (*++pat != ']')
          {
             if (!*pat)
             {
                /* Class is not terminated */
                return 1;
             }
             else if (*pat == '-')
             {
                /* A range needs an upper bound */
                if ((*++pat == ']') || *pat == '\0')
                {
                   return(1);
                }
                /* Mark everything from the previous character up to the bound */
                for (i = lastchar; i <= *pat; i++)
                {
                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
                }
             }
             else
             {
                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
                lastchar = *pat;
             }
          }
          /* pat now points at the closing ']' */
       } /* -END- if Character range specification */


       /*
        * Char match, or char range match?
        */
       if ((*pat == *txt)
        || (*pat == '?')
        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
       {
          /*
           * Success: Go ahead
           */
          pat++;
       }
       else if (!wildcard)
       {
          /*
           * No match && no wildcard: No luck
           */
          return 1;
       }
       else if (pat != fallback)
       {
          /*
           * Increment text pointer if in char range matching
           */
          if (*pat == ']')
          {
             txt++;
          }
          /*
           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
           */
          pat = fallback;
          /*
           * Restart matching from current text pointer
           */
          continue;
       }
       txt++;
    }

    /* Cut off extra '*'s */
    if (*pat == '*') pat++;

    /* If this is the pattern's end, fine! */
    return(*pat);

 }
1012
1013
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Walks two vectors of domain name components in
 *                lock-step and matches the pattern component at
 *                each index against the domain component at the
 *                same index with simplematch().  Because exactly
 *                len pairs are compared, the comparison is anchored
 *                on both sides.  Only used by domain_match().
 *
 * Parameters  :
 *          1  :  pv = vector of pattern components
 *          2  :  fv = vector of domain components to check
 *          3  :  len = number of components to compare; both
 *                      vectors must provide at least this many.
 *
 * Returns     :  0 => every pair matched, 1 => some pair did not.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   int idx = 0;

   while (idx < len)
   {
      if (0 != simplematch(pv[idx], fv[idx]))
      {
         /* The first mismatch settles the outcome. */
         return 1;
      }
      idx++;
   }

   return 0;

}
1048
1049
1050 /*********************************************************************
1051  *
1052  * Function    :  domain_match
1053  *
1054  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1055  *                p.pattern->unachored, the comparison is un-, left-,
1056  *                right-anchored, or both.
1057  *                The individual domain names are compared with
1058  *                simplematch().
1059  *
1060  * Parameters  :
1061  *          1  :  p = a domain that may contain a '*' as a wildcard.
1062  *          2  :  fqdn = domain name against which the patterns are compared.
1063  *
1064  * Returns     :  0 => domains are equivalent, else no match.
1065  *
1066  *********************************************************************/
1067 static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
1068 {
1069    char **pv, **fv;  /* vectors  */
1070    int    plen, flen;
1071    int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1072
1073    plen = p->pattern.url_spec.dcount;
1074    flen = fqdn->dcount;
1075
1076    if (flen < plen)
1077    {
1078       /* fqdn is too short to match this pattern */
1079       return 1;
1080    }
1081
1082    pv   = p->pattern.url_spec.dvec;
1083    fv   = fqdn->dvec;
1084
1085    if (unanchored == ANCHOR_LEFT)
1086    {
1087       /*
1088        * Right anchored.
1089        *
1090        * Convert this into a fully anchored pattern with
1091        * the fqdn and pattern the same length
1092        */
1093       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1094       return simple_domaincmp(pv, fv, plen);
1095    }
1096    else if (unanchored == 0)
1097    {
1098       /* Fully anchored, check length */
1099       if (flen != plen)
1100       {
1101          return 1;
1102       }
1103       return simple_domaincmp(pv, fv, plen);
1104    }
1105    else if (unanchored == ANCHOR_RIGHT)
1106    {
1107       /* Left anchored, ignore all extra in fqdn */
1108       return simple_domaincmp(pv, fv, plen);
1109    }
1110    else
1111    {
1112       /* Unanchored */
1113       int n;
1114       int maxn = flen - plen;
1115       for (n = 0; n <= maxn; n++)
1116       {
1117          if (!simple_domaincmp(pv, fv, plen))
1118          {
1119             return 0;
1120          }
1121          /*
1122           * Doesn't match from start of fqdn
1123           * Try skipping first part of fqdn
1124           */
1125          fv++;
1126       }
1127       return 1;
1128    }
1129
1130 }
1131 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1132
1133
1134 /*********************************************************************
1135  *
1136  * Function    :  create_pattern_spec
1137  *
1138  * Description :  Creates a "pattern_spec" structure from a string.
1139  *                When finished, free with free_pattern_spec().
1140  *
1141  * Parameters  :
1142  *          1  :  pattern = Target pattern_spec to be filled in.
1143  *                          Will be zeroed before use.
1144  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1145  *                      contents of this buffer are destroyed by this
1146  *                      function.  If this function succeeds, the
1147  *                      buffer is copied to pattern->spec.  If this
1148  *                      function fails, the contents of the buffer
1149  *                      are lost forever.
1150  *
1151  * Returns     :  JB_ERR_OK - Success
1152  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1153  *                               written to system log)
1154  *
1155  *********************************************************************/
1156 jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
1157 {
1158    static const struct
1159    {
1160       /** The tag pattern prefix to match */
1161       const char *prefix;
1162
1163       /** The length of the prefix to match */
1164       const size_t prefix_length;
1165
1166       /** The pattern flag */
1167       const unsigned flag;
1168    } tag_pattern[] = {
1169       { "TAG:",              4, PATTERN_SPEC_TAG_PATTERN},
1170       { "NO-REQUEST-TAG:",  15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
1171       { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
1172    };
1173    int i;
1174
1175    assert(pattern);
1176    assert(buf);
1177
1178    memset(pattern, '\0', sizeof(*pattern));
1179
1180    /* Remember the original specification for the CGI pages. */
1181    pattern->spec = strdup_or_die(buf);
1182
1183    /* Check if it's a tag pattern */
1184    for (i = 0; i < SZ(tag_pattern); i++)
1185    {
1186       if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
1187       {
1188          /* The regex starts after the prefix */
1189          const char *tag_regex = buf + tag_pattern[i].prefix_length;
1190
1191          pattern->flags |= tag_pattern[i].flag;
1192
1193          return compile_pattern(tag_regex, NO_ANCHORING, pattern,
1194             &pattern->pattern.tag_regex);
1195       }
1196    }
1197
1198    /* If it isn't a tag pattern it must be an URL pattern. */
1199    pattern->flags |= PATTERN_SPEC_URL_PATTERN;
1200
1201    return compile_url_pattern(pattern, buf);
1202
1203 }
1204
1205
1206 /*********************************************************************
1207  *
1208  * Function    :  free_pattern_spec
1209  *
1210  * Description :  Called from the "unloaders".  Freez the pattern
1211  *                structure elements.
1212  *
1213  * Parameters  :
1214  *          1  :  pattern = pointer to a pattern_spec structure.
1215  *
1216  * Returns     :  N/A
1217  *
1218  *********************************************************************/
1219 void free_pattern_spec(struct pattern_spec *pattern)
1220 {
1221    if (pattern == NULL) return;
1222
1223    freez(pattern->spec);
1224 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1225    if (pattern->pattern.url_spec.host_regex)
1226    {
1227       regfree(pattern->pattern.url_spec.host_regex);
1228       freez(pattern->pattern.url_spec.host_regex);
1229    }
1230 #else
1231    freez(pattern->pattern.url_spec.dbuffer);
1232    freez(pattern->pattern.url_spec.dvec);
1233    pattern->pattern.url_spec.dcount = 0;
1234 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1235    freez(pattern->pattern.url_spec.port_list);
1236    if (pattern->pattern.url_spec.preg)
1237    {
1238       regfree(pattern->pattern.url_spec.preg);
1239       freez(pattern->pattern.url_spec.preg);
1240    }
1241    if (pattern->pattern.tag_regex)
1242    {
1243       regfree(pattern->pattern.tag_regex);
1244       freez(pattern->pattern.tag_regex);
1245    }
1246 }
1247
1248
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks a port against an optional port list.  A
 *                missing list (NULL) accepts every port; otherwise
 *                match_portlist() decides.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL for "any port".
 *
 * Returns     :  1 if the port is accepted, 0 otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (NULL == port_list)
   {
      /* The pattern does not restrict the port. */
      return 1;
   }

   return match_portlist(port_list, port) ? 1 : 0;
}
1266
1267
1268 /*********************************************************************
1269  *
1270  * Function    :  host_matches
1271  *
1272  * Description :  Compares a host against a host pattern.
1273  *
1274  * Parameters  :
1275  *          1  :  url = The URL to match
1276  *          2  :  pattern = The URL pattern
1277  *
1278  * Returns     :  TRUE for yes, FALSE otherwise.
1279  *
1280  *********************************************************************/
1281 static int host_matches(const struct http_request *http,
1282                         const struct pattern_spec *pattern)
1283 {
1284    assert(http->host != NULL);
1285 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1286    return ((NULL == pattern->pattern.url_spec.host_regex)
1287       || (0 == regexec(pattern->pattern.url_spec.host_regex, http->host, 0, NULL, 0)));
1288 #else
1289    return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
1290 #endif
1291 }
1292
1293
1294 /*********************************************************************
1295  *
1296  * Function    :  path_matches
1297  *
1298  * Description :  Compares a path against a path pattern.
1299  *
1300  * Parameters  :
1301  *          1  :  path = The path to match
1302  *          2  :  pattern = The URL pattern
1303  *
1304  * Returns     :  TRUE for yes, FALSE otherwise.
1305  *
1306  *********************************************************************/
1307 static int path_matches(const char *path, const struct pattern_spec *pattern)
1308 {
1309    return ((NULL == pattern->pattern.url_spec.preg)
1310       || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
1311 }
1312
1313
1314 /*********************************************************************
1315  *
1316  * Function    :  url_match
1317  *
1318  * Description :  Compare a URL against a URL pattern.
1319  *
1320  * Parameters  :
1321  *          1  :  pattern = a URL pattern
1322  *          2  :  url = URL to match
1323  *
1324  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1325  *
1326  *********************************************************************/
1327 int url_match(const struct pattern_spec *pattern,
1328               const struct http_request *http)
1329 {
1330    if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
1331    {
1332       /* It's not an URL pattern and thus shouldn't be matched against URLs */
1333       return 0;
1334    }
1335
1336    return (port_matches(http->port, pattern->pattern.url_spec.port_list)
1337       && host_matches(http, pattern) && path_matches(http->path, pattern));
1338
1339 }
1340
1341
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place: nothing is copied,
 *                allocated or modified.  Numbers are read with
 *                strtol() in base 10, so an item without digits
 *                counts as 0 and oversized numbers are saturated
 *                instead of causing undefined behaviour.
 *
 *                Within an item the first '-' separates the lower
 *                from the upper bound.  A missing lower bound is 0,
 *                a missing (or zero) upper bound is 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list, must not be NULL
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const long highest_port = 65535;
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      /* The item ends at the next comma or at the end of the list. */
      const char *comma = strchr(item, (int) ',');
      const size_t item_len = (NULL != comma) ? (size_t)(comma - item) : strlen(item);

      /* Only a dash inside the current item makes it a range. */
      const char *dash = memchr(item, (int) '-', item_len);

      char *stop;
      long lower = strtol(item, &stop, 10);

      if (NULL == dash)
      {
         /*
          * No dash, check for equality.  Without a dash in the item
          * strtol() cannot have read past the comma.
          */
         if ((long)port == lower)
         {
            return 1;
         }
      }
      else
      {
         long upper;

         if (stop > dash)
         {
            /*
             * Nothing but white space precedes the dash, so strtol()
             * took the range separator for a minus sign.  The lower
             * bound is in fact missing and counts as 0.
             */
            lower = 0;
         }

         /*
          * This is a range, so check if between lower and upper,
          * or, if upper was omitted, between lower and 65K
          */
         upper = strtol(dash + 1, NULL, 10);
         if (0 == upper)
         {
            upper = highest_port;
         }

         if (((long)port >= lower) && ((long)port <= upper))
         {
            return 1;
         }
      }

      /*
       * Jump to next item, if there is one
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1420
1421
1422 /*********************************************************************
1423  *
1424  * Function    :  parse_forwarder_address
1425  *
1426  * Description :  Parse out the host and port from a forwarder address.
1427  *
1428  * Parameters  :
1429  *          1  :  address = The forwarder address to parse.
1430  *          2  :  hostname = Used to return the hostname. NULL on error.
1431  *          3  :  port = Used to return the port. Untouched if no port
1432  *                       is specified.
1433  *
1434  * Returns     :  JB_ERR_OK on success
1435  *                JB_ERR_MEMORY on out of memory
1436  *                JB_ERR_PARSE on malformed address.
1437  *
1438  *********************************************************************/
1439 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1440 {
1441    char *p = address;
1442
1443    if ((*address == '[') && (NULL == strchr(address, ']')))
1444    {
1445       /* XXX: Should do some more validity checks here. */
1446       return JB_ERR_PARSE;
1447    }
1448
1449    *hostname = strdup_or_die(address);
1450
1451    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1452    {
1453       *p++ = '\0';
1454       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1455       if (*p == ':')
1456       {
1457          *port = (int)strtol(++p, NULL, 0);
1458       }
1459    }
1460    else if (NULL != (p = strchr(*hostname, ':')))
1461    {
1462       *p++ = '\0';
1463       *port = (int)strtol(p, NULL, 0);
1464    }
1465
1466    return JB_ERR_OK;
1467
1468 }
1469
1470
1471 /*
1472   Local Variables:
1473   tab-width: 3
1474   end:
1475 */