urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.83 2014/06/20 09:46:56 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2014
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
  59 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  60
  61 enum regex_anchoring
  62 {
  63    NO_ANCHORING,
  64    LEFT_ANCHORED,
  65    RIGHT_ANCHORED,
  66    RIGHT_ANCHORED_HOST
  67 };
  68 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern);
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_PARSE on malformed command/URL
 119  *                             or >100 domains deep.
 120  *
 121  *********************************************************************/
 122 jb_err init_domain_components(struct http_request *http)
 123 {
 124    char *vec[BUFFER_SIZE];
 125    size_t size;
 126    char *p;
 127
 128    http->dbuffer = strdup_or_die(http->host);
 129
 130    /* map to lower case */
 131    for (p = http->dbuffer; *p ; p++)
 132    {
 133       *p = (char)privoxy_tolower(*p);
 134    }
 135
 136    /* split the domain name into components */
 137    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 138
 139    if (http->dcount <= 0)
 140    {
 141       /*
 142        * Error: More than SZ(vec) components in domain
 143        *    or: no components in domain
 144        */
 145       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 146       return JB_ERR_PARSE;
 147    }
 148
 149    /* save a copy of the pointers in dvec */
 150    size = (size_t)http->dcount * sizeof(*http->dvec);
 151
 152    http->dvec = malloc_or_die(size);
 153
 154    memcpy(http->dvec, vec, size);
 155
 156    return JB_ERR_OK;
 157 }
 158 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 159
 160
 161 /*********************************************************************
 162  *
 163  * Function    :  url_requires_percent_encoding
 164  *
 165  * Description :  Checks if an URL contains invalid characters
 166  *                according to RFC 3986 that should be percent-encoded.
 167  *                Does not verify whether or not the passed string
 168  *                actually is a valid URL.
 169  *
 170  * Parameters  :
 171  *          1  :  url = URL to check
 172  *
 173  * Returns     :  True in case of valid URLs, false otherwise
 174  *
 175  *********************************************************************/
 176 int url_requires_percent_encoding(const char *url)
 177 {
 178    static const char allowed_characters[128] = {
 179       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 180       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 181       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 182       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 183       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 184       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 185       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 186       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 187       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 188       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 189       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 190       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 191       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 192    };
 193
 194    while (*url != '\0')
 195    {
 196       const unsigned int i = (unsigned char)*url++;
 197       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 198       {
 199          return TRUE;
 200       }
 201    }
 202
 203    return FALSE;
 204
 205 }
 206
 207
 208 /*********************************************************************
 209  *
 210  * Function    :  parse_http_url
 211  *
 212  * Description :  Parse out the host and port from the URL.  Find the
 213  *                hostname & path, port (if ':'), and/or password (if '@')
 214  *
 215  * Parameters  :
 216  *          1  :  url = URL (or is it URI?) to break down
 217  *          2  :  http = pointer to the http structure to hold elements.
 218  *                       Must be initialized with valid values (like NULLs).
 219  *          3  :  require_protocol = Whether or not URLs without
 220  *                                   protocol are acceptable.
 221  *
 222  * Returns     :  JB_ERR_OK on success
 223  *                JB_ERR_PARSE on malformed command/URL
 224  *                             or >100 domains deep.
 225  *
 226  *********************************************************************/
 227 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 228 {
 229    int host_available = 1; /* A proxy can dream. */
 230
 231    /*
 232     * Save our initial URL
 233     */
 234    http->url = strdup_or_die(url);
 235
 236    /*
 237     * Check for * URI. If found, we're done.
 238     */
 239    if (*http->url == '*')
 240    {
 241       http->path = strdup_or_die("*");
 242       http->hostport = strdup_or_die("");
 243       if (http->url[1] != '\0')
 244       {
 245          return JB_ERR_PARSE;
 246       }
 247       return JB_ERR_OK;
 248    }
 249
 250
 251    /*
 252     * Split URL into protocol,hostport,path.
 253     */
 254    {
 255       char *buf;
 256       char *url_noproto;
 257       char *url_path;
 258
 259       buf = strdup_or_die(url);
 260
 261       /* Find the start of the URL in our scratch space */
 262       url_noproto = buf;
 263       if (strncmpic(url_noproto, "http://",  7) == 0)
 264       {
 265          url_noproto += 7;
 266       }
 267       else if (strncmpic(url_noproto, "https://", 8) == 0)
 268       {
 269          /*
 270           * Should only happen when called from cgi_show_url_info().
 271           */
 272          url_noproto += 8;
 273          http->ssl = 1;
 274       }
 275       else if (*url_noproto == '/')
 276       {
 277         /*
 278          * Short request line without protocol and host.
 279          * Most likely because the client's request
 280          * was intercepted and redirected into Privoxy.
 281          */
 282          http->host = NULL;
 283          host_available = 0;
 284       }
 285       else if (require_protocol)
 286       {
 287          freez(buf);
 288          return JB_ERR_PARSE;
 289       }
 290
 291       url_path = strchr(url_noproto, '/');
 292       if (url_path != NULL)
 293       {
 294          /*
 295           * Got a path.
 296           *
 297           * NOTE: The following line ignores the path for HTTPS URLS.
 298           * This means that you get consistent behaviour if you type a
 299           * https URL in and it's parsed by the function.  (When the
 300           * URL is actually retrieved, SSL hides the path part).
 301           */
 302          http->path = strdup_or_die(http->ssl ? "/" : url_path);
 303          *url_path = '\0';
 304          http->hostport = strdup_or_die(url_noproto);
 305       }
 306       else
 307       {
 308          /*
 309           * Repair broken HTTP requests that don't contain a path,
 310           * or CONNECT requests
 311           */
 312          http->path = strdup_or_die("/");
 313          http->hostport = strdup_or_die(url_noproto);
 314       }
 315
 316       freez(buf);
 317    }
 318
 319    if (!host_available)
 320    {
 321       /* Without host, there is nothing left to do here */
 322       return JB_ERR_OK;
 323    }
 324
 325    /*
 326     * Split hostport into user/password (ignored), host, port.
 327     */
 328    {
 329       char *buf;
 330       char *host;
 331       char *port;
 332
 333       buf = strdup_or_die(http->hostport);
 334
 335       /* check if url contains username and/or password */
 336       host = strchr(buf, '@');
 337       if (host != NULL)
 338       {
 339          /* Contains username/password, skip it and the @ sign. */
 340          host++;
 341       }
 342       else
 343       {
 344          /* No username or password. */
 345          host = buf;
 346       }
 347
 348       /* Move after hostname before port number */
 349       if (*host == '[')
 350       {
 351          /* Numeric IPv6 address delimited by brackets */
 352          host++;
 353          port = strchr(host, ']');
 354
 355          if (port == NULL)
 356          {
 357             /* Missing closing bracket */
 358             freez(buf);
 359             return JB_ERR_PARSE;
 360          }
 361
 362          *port++ = '\0';
 363
 364          if (*port == '\0')
 365          {
 366             port = NULL;
 367          }
 368          else if (*port != ':')
 369          {
 370             /* Garbage after closing bracket */
 371             freez(buf);
 372             return JB_ERR_PARSE;
 373          }
 374       }
 375       else
 376       {
 377          /* Plain non-escaped hostname */
 378          port = strchr(host, ':');
 379       }
 380
 381       /* check if url contains port */
 382       if (port != NULL)
 383       {
 384          /* Contains port */
 385          char *endptr;
 386          long parsed_port;
 387          /* Terminate hostname and point to start of port string */
 388          *port++ = '\0';
 389          parsed_port = strtol(port, &endptr, 10);
 390          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 391          {
 392             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 393             freez(buf);
 394             return JB_ERR_PARSE;
 395          }
 396          http->port = (int)parsed_port;
 397       }
 398       else
 399       {
 400          /* No port specified. */
 401          http->port = (http->ssl ? 443 : 80);
 402       }
 403
 404       http->host = strdup_or_die(host);
 405
 406       freez(buf);
 407    }
 408
 409 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 410    return JB_ERR_OK;
 411 #else
 412    /* Split domain name so we can compare it against wildcards */
 413    return init_domain_components(http);
 414 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 415
 416 }
 417
 418
 419 /*********************************************************************
 420  *
 421  * Function    :  unknown_method
 422  *
 423  * Description :  Checks whether a method is unknown.
 424  *
 425  * Parameters  :
 426  *          1  :  method = points to a http method
 427  *
 428  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 429  *
 430  *********************************************************************/
 431 static int unknown_method(const char *method)
 432 {
 433    static const char * const known_http_methods[] = {
 434       /* Basic HTTP request type */
 435       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 436       /* webDAV extensions (RFC2518) */
 437       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 438       /*
 439        * Microsoft webDAV extension for Exchange 2000.  See:
 440        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 441        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 442        */
 443       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 444       /*
 445        * Another Microsoft webDAV extension for Exchange 2000.  See:
 446        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 447        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 448        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 449        */
 450       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 451       /*
 452        * Yet another WebDAV extension, this time for
 453        * Web Distributed Authoring and Versioning (RFC3253)
 454        */
 455       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 456       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 457       /*
 458        * The PATCH method is defined by RFC5789, the format of the
 459        * actual patch in the body depends on the application, but from
 460        * Privoxy's point of view it doesn't matter.
 461        */
 462       "PATCH",
 463    };
 464    int i;
 465
 466    for (i = 0; i < SZ(known_http_methods); i++)
 467    {
 468       if (0 == strcmpic(method, known_http_methods[i]))
 469       {
 470          return FALSE;
 471       }
 472    }
 473
 474    return TRUE;
 475
 476 }
 477
 478
 479 /*********************************************************************
 480  *
 481  * Function    :  parse_http_request
 482  *
 483  * Description :  Parse out the host and port from the URL.  Find the
 484  *                hostname & path, port (if ':'), and/or password (if '@')
 485  *
 486  * Parameters  :
 487  *          1  :  req = HTTP request line to break down
 488  *          2  :  http = pointer to the http structure to hold elements
 489  *
 490  * Returns     :  JB_ERR_OK on success
 491  *                JB_ERR_CGI_PARAMS on malformed command/URL
 492  *                                  or >100 domains deep.
 493  *
 494  *********************************************************************/
 495 jb_err parse_http_request(const char *req, struct http_request *http)
 496 {
 497    char *buf;
 498    char *v[3];
 499    int n;
 500    jb_err err;
 501
 502    memset(http, '\0', sizeof(*http));
 503
 504    buf = strdup_or_die(req);
 505
 506    n = ssplit(buf, " \r\n", v, SZ(v));
 507    if (n != 3)
 508    {
 509       freez(buf);
 510       return JB_ERR_PARSE;
 511    }
 512
 513    /*
 514     * Fail in case of unknown methods
 515     * which we might not handle correctly.
 516     *
 517     * XXX: There should be a config option
 518     * to forward requests with unknown methods
 519     * anyway. Most of them don't need special
 520     * steps.
 521     */
 522    if (unknown_method(v[0]))
 523    {
 524       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 525       freez(buf);
 526       return JB_ERR_PARSE;
 527    }
 528
 529    if (strcmpic(v[2], "HTTP/1.1") && strcmpic(v[2], "HTTP/1.0"))
 530    {
 531       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 532          "versions are 1.0 and 1.1. This rules out: %s", v[2]);
 533       freez(buf);
 534       return JB_ERR_PARSE;
 535    }
 536
 537    http->ssl = !strcmpic(v[0], "CONNECT");
 538
 539    err = parse_http_url(v[1], http, !http->ssl);
 540    if (err)
 541    {
 542       freez(buf);
 543       return err;
 544    }
 545
 546    /*
 547     * Copy the details into the structure
 548     */
 549    http->cmd = strdup_or_die(req);
 550    http->gpc = strdup_or_die(v[0]);
 551    http->ver = strdup_or_die(v[2]);
 552    http->ocmd = strdup_or_die(http->cmd);
 553
 554    freez(buf);
 555
 556    return JB_ERR_OK;
 557
 558 }
 559
 560
 561 /*********************************************************************
 562  *
 563  * Function    :  compile_pattern
 564  *
 565  * Description :  Compiles a host, domain or TAG pattern.
 566  *
 567  * Parameters  :
 568  *          1  :  pattern = The pattern to compile.
 569  *          2  :  anchoring = How the regex should be modified
 570  *                            before compilation. Can be either
 571  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 572  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 573  *          3  :  url     = In case of failures, the spec member is
 574  *                          logged and the structure freed.
 575  *          4  :  regex   = Where the compiled regex should be stored.
 576  *
 577  * Returns     :  JB_ERR_OK - Success
 578  *                JB_ERR_MEMORY - Out of memory
 579  *                JB_ERR_PARSE - Cannot parse regex
 580  *
 581  *********************************************************************/
 582 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 583                               struct pattern_spec *url, regex_t **regex)
 584 {
 585    int errcode;
 586    char rebuf[BUFFER_SIZE];
 587    const char *fmt = NULL;
 588
 589    assert(pattern);
 590    assert(strlen(pattern) < sizeof(rebuf) - 2);
 591
 592    if (pattern[0] == '\0')
 593    {
 594       *regex = NULL;
 595       return JB_ERR_OK;
 596    }
 597
 598    switch (anchoring)
 599    {
 600       case NO_ANCHORING:
 601          fmt = "%s";
 602          break;
 603       case RIGHT_ANCHORED:
 604          fmt = "%s$";
 605          break;
 606       case RIGHT_ANCHORED_HOST:
 607          fmt = "%s\\.?$";
 608          break;
 609       case LEFT_ANCHORED:
 610          fmt = "^%s";
 611          break;
 612       default:
 613          log_error(LOG_LEVEL_FATAL,
 614             "Invalid anchoring in compile_pattern %d", anchoring);
 615    }
 616
 617    *regex = zalloc(sizeof(**regex));
 618    if (NULL == *regex)
 619    {
 620       free_pattern_spec(url);
 621       return JB_ERR_MEMORY;
 622    }
 623
 624    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 625
 626    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 627
 628    if (errcode)
 629    {
 630       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 631       if (errlen > (sizeof(rebuf) - (size_t)1))
 632       {
 633          errlen = sizeof(rebuf) - (size_t)1;
 634       }
 635       rebuf[errlen] = '\0';
 636       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 637          pattern, url->spec, rebuf);
 638       free_pattern_spec(url);
 639
 640       return JB_ERR_PARSE;
 641    }
 642
 643    return JB_ERR_OK;
 644
 645 }
 646
 647
 648 /*********************************************************************
 649  *
 650  * Function    :  compile_url_pattern
 651  *
 652  * Description :  Compiles the three parts of an URL pattern.
 653  *
 654  * Parameters  :
 655  *          1  :  url = Target pattern_spec to be filled in.
 656  *          2  :  buf = The url pattern to compile. Will be messed up.
 657  *
 658  * Returns     :  JB_ERR_OK - Success
 659  *                JB_ERR_MEMORY - Out of memory
 660  *                JB_ERR_PARSE - Cannot parse regex
 661  *
 662  *********************************************************************/
 663 static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
 664 {
 665    char *p;
 666
 667    p = strchr(buf, '/');
 668    if (NULL != p)
 669    {
 670       /*
 671        * Only compile the regex if it consists of more than
 672        * a single slash, otherwise it wouldn't affect the result.
 673        */
 674       if (p[1] != '\0')
 675       {
 676          /*
 677           * XXX: does it make sense to compile the slash at the beginning?
 678           */
 679          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
 680
 681          if (JB_ERR_OK != err)
 682          {
 683             return err;
 684          }
 685       }
 686       *p = '\0';
 687    }
 688
 689    /*
 690     * IPv6 numeric hostnames can contain colons, thus we need
 691     * to delimit the hostname before the real port separator.
 692     * As brackets are already used in the hostname pattern,
 693     * we use angle brackets ('<', '>') instead.
 694     */
 695    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 696    {
 697       *p++ = '\0';
 698       buf++;
 699
 700       if (*p == '\0')
 701       {
 702          /* IPv6 address without port number */
 703          p = NULL;
 704       }
 705       else if (*p != ':')
 706       {
 707          /* Garbage after address delimiter */
 708          return JB_ERR_PARSE;
 709       }
 710    }
 711    else
 712    {
 713       p = strchr(buf, ':');
 714    }
 715
 716    if (NULL != p)
 717    {
 718       *p++ = '\0';
 719       url->pattern.url_spec.port_list = strdup_or_die(p);
 720    }
 721    else
 722    {
 723       url->pattern.url_spec.port_list = NULL;
 724    }
 725
 726    if (buf[0] != '\0')
 727    {
 728       return compile_host_pattern(url, buf);
 729    }
 730
 731    return JB_ERR_OK;
 732
 733 }
 734
 735
 736 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 737 /*********************************************************************
 738  *
 739  * Function    :  compile_host_pattern
 740  *
 741  * Description :  Parses and compiles a host pattern.
 742  *
 743  * Parameters  :
 744  *          1  :  url = Target pattern_spec to be filled in.
 745  *          2  :  host_pattern = Host pattern to compile.
 746  *
 747  * Returns     :  JB_ERR_OK - Success
 748  *                JB_ERR_MEMORY - Out of memory
 749  *                JB_ERR_PARSE - Cannot parse regex
 750  *
 751  *********************************************************************/
 752 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 753 {
 754    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->pattern.url_spec.host_regex);
 755 }
 756
 757 #else
 758
 759 /*********************************************************************
 760  *
 761  * Function    :  compile_host_pattern
 762  *
 763  * Description :  Parses and "compiles" an old-school host pattern.
 764  *
 765  * Parameters  :
 766  *          1  :  url = Target pattern_spec to be filled in.
 767  *          2  :  host_pattern = Host pattern to parse.
 768  *
 769  * Returns     :  JB_ERR_OK - Success
 770  *                JB_ERR_PARSE - Cannot parse regex
 771  *
 772  *********************************************************************/
 773 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 774 {
 775    char *v[150];
 776    size_t size;
 777    char *p;
 778
 779    /*
 780     * Parse domain part
 781     */
 782    if (host_pattern[strlen(host_pattern) - 1] == '.')
 783    {
 784       url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
 785    }
 786    if (host_pattern[0] == '.')
 787    {
 788       url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
 789    }
 790
 791    /*
 792     * Split domain into components
 793     */
 794    url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
 795
 796    /*
 797     * Map to lower case
 798     */
 799    for (p = url->pattern.url_spec.dbuffer; *p ; p++)
 800    {
 801       *p = (char)privoxy_tolower(*p);
 802    }
 803
 804    /*
 805     * Split the domain name into components
 806     */
 807    url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
 808
 809    if (url->pattern.url_spec.dcount < 0)
 810    {
 811       free_pattern_spec(url);
 812       return JB_ERR_PARSE;
 813    }
 814    else if (url->pattern.url_spec.dcount != 0)
 815    {
 816       /*
 817        * Save a copy of the pointers in dvec
 818        */
 819       size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
 820
 821       url->pattern.url_spec.dvec = malloc_or_die(size);
 822
 823       memcpy(url->pattern.url_spec.dvec, v, size);
 824    }
 825    /*
 826     * else dcount == 0 in which case we needn't do anything,
 827     * since dvec will never be accessed and the pattern will
 828     * match all domains.
 829     */
 830    return JB_ERR_OK;
 831 }
 832
 833
 834 /*********************************************************************
 835  *
 836  * Function    :  simplematch
 837  *
 838  * Description :  String matching, with a (greedy) '*' wildcard that
 839  *                stands for zero or more arbitrary characters and
 840  *                character classes in [], which take both enumerations
 841  *                and ranges.
 842  *
 843  * Parameters  :
 844  *          1  :  pattern = pattern for matching
 845  *          2  :  text    = text to be matched
 846  *
 847  * Returns     :  0 if match, else nonzero
 848  *
 849  *********************************************************************/
 850 static int simplematch(const char *pattern, const char *text)
 851 {
 852    const unsigned char *pat = (const unsigned char *)pattern;
 853    const unsigned char *txt = (const unsigned char *)text;
 854    const unsigned char *fallback = pat;
 855    int wildcard = 0;
 856
 857    unsigned char lastchar = 'a';
 858    unsigned i;
 859    unsigned char charmap[32];
 860
 861    while (*txt)
 862    {
 863
 864       /* EOF pattern but !EOF text? */
 865       if (*pat == '\0')
 866       {
 867          if (wildcard)
 868          {
 869             pat = fallback;
 870          }
 871          else
 872          {
 873             return 1;
 874          }
 875       }
 876
 877       /* '*' in the pattern?  */
 878       if (*pat == '*')
 879       {
 880
 881          /* The pattern ends afterwards? Speed up the return. */
 882          if (*++pat == '\0')
 883          {
 884             return 0;
 885          }
 886
 887          /* Else, set wildcard mode and remember position after '*' */
 888          wildcard = 1;
 889          fallback = pat;
 890       }
 891
 892       /* Character range specification? */
 893       if (*pat == '[')
 894       {
 895          memset(charmap, '\0', sizeof(charmap));
 896
 897          while (*++pat != ']')
 898          {
 899             if (!*pat)
 900             {
 901                return 1;
 902             }
 903             else if (*pat == '-')
 904             {
 905                if ((*++pat == ']') || *pat == '\0')
 906                {
 907                   return(1);
 908                }
 909                for (i = lastchar; i <= *pat; i++)
 910                {
 911                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 912                }
 913             }
 914             else
 915             {
 916                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 917                lastchar = *pat;
 918             }
 919          }
 920       } /* -END- if Character range specification */
 921
 922
 923       /*
 924        * Char match, or char range match?
 925        */
 926       if ((*pat == *txt)
 927        || (*pat == '?')
 928        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 929       {
 930          /*
 931           * Success: Go ahead
 932           */
 933          pat++;
 934       }
 935       else if (!wildcard)
 936       {
 937          /*
 938           * No match && no wildcard: No luck
 939           */
 940          return 1;
 941       }
 942       else if (pat != fallback)
 943       {
 944          /*
 945           * Increment text pointer if in char range matching
 946           */
 947          if (*pat == ']')
 948          {
 949             txt++;
 950          }
 951          /*
 952           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 953           */
 954          pat = fallback;
 955          /*
 956           * Restart matching from current text pointer
 957           */
 958          continue;
 959       }
 960       txt++;
 961    }
 962
 963    /* Cut off extra '*'s */
 964    if (*pat == '*') pat++;
 965
 966    /* If this is the pattern's end, fine! */
 967    return(*pat);
 968
 969 }
 970
 971
 972 /*********************************************************************
 973  *
 974  * Function    :  simple_domaincmp
 975  *
 976  * Description :  Domain-wise Compare fqdn's.  The comparison is
 977  *                both left- and right-anchored.  The individual
 978  *                domain names are compared with simplematch().
 979  *                This is only used by domain_match.
 980  *
 981  * Parameters  :
 982  *          1  :  pv = array of patterns to compare
 983  *          2  :  fv = array of domain components to compare
 984  *          3  :  len = length of the arrays (both arrays are the
 985  *                      same length - if they weren't, it couldn't
 986  *                      possibly be a match).
 987  *
 988  * Returns     :  0 => domains are equivalent, else no match.
 989  *
 990  *********************************************************************/
 991 static int simple_domaincmp(char **pv, char **fv, int len)
 992 {
 993    int n;
 994
 995    for (n = 0; n < len; n++)
 996    {
 997       if (simplematch(pv[n], fv[n]))
 998       {
 999          return 1;
1000       }
1001    }
1002
1003    return 0;
1004
1005 }
1006
1007
1008 /*********************************************************************
1009  *
1010  * Function    :  domain_match
1011  *
1012  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1013  *                p.pattern->unachored, the comparison is un-, left-,
1014  *                right-anchored, or both.
1015  *                The individual domain names are compared with
1016  *                simplematch().
1017  *
1018  * Parameters  :
1019  *          1  :  p = a domain that may contain a '*' as a wildcard.
1020  *          2  :  fqdn = domain name against which the patterns are compared.
1021  *
1022  * Returns     :  0 => domains are equivalent, else no match.
1023  *
1024  *********************************************************************/
1025 static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
1026 {
1027    char **pv, **fv;  /* vectors  */
1028    int    plen, flen;
1029    int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1030
1031    plen = p->pattern.url_spec.dcount;
1032    flen = fqdn->dcount;
1033
1034    if (flen < plen)
1035    {
1036       /* fqdn is too short to match this pattern */
1037       return 1;
1038    }
1039
1040    pv   = p->pattern.url_spec.dvec;
1041    fv   = fqdn->dvec;
1042
1043    if (unanchored == ANCHOR_LEFT)
1044    {
1045       /*
1046        * Right anchored.
1047        *
1048        * Convert this into a fully anchored pattern with
1049        * the fqdn and pattern the same length
1050        */
1051       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1052       return simple_domaincmp(pv, fv, plen);
1053    }
1054    else if (unanchored == 0)
1055    {
1056       /* Fully anchored, check length */
1057       if (flen != plen)
1058       {
1059          return 1;
1060       }
1061       return simple_domaincmp(pv, fv, plen);
1062    }
1063    else if (unanchored == ANCHOR_RIGHT)
1064    {
1065       /* Left anchored, ignore all extra in fqdn */
1066       return simple_domaincmp(pv, fv, plen);
1067    }
1068    else
1069    {
1070       /* Unanchored */
1071       int n;
1072       int maxn = flen - plen;
1073       for (n = 0; n <= maxn; n++)
1074       {
1075          if (!simple_domaincmp(pv, fv, plen))
1076          {
1077             return 0;
1078          }
1079          /*
1080           * Doesn't match from start of fqdn
1081           * Try skipping first part of fqdn
1082           */
1083          fv++;
1084       }
1085       return 1;
1086    }
1087
1088 }
1089 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1090
1091
1092 /*********************************************************************
1093  *
1094  * Function    :  create_pattern_spec
1095  *
1096  * Description :  Creates a "pattern_spec" structure from a string.
1097  *                When finished, free with free_pattern_spec().
1098  *
1099  * Parameters  :
1100  *          1  :  pattern = Target pattern_spec to be filled in.
1101  *                          Will be zeroed before use.
1102  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1103  *                      contents of this buffer are destroyed by this
1104  *                      function.  If this function succeeds, the
1105  *                      buffer is copied to pattern->spec.  If this
1106  *                      function fails, the contents of the buffer
1107  *                      are lost forever.
1108  *
1109  * Returns     :  JB_ERR_OK - Success
1110  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1111  *                               written to system log)
1112  *
1113  *********************************************************************/
1114 jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
1115 {
1116    static const struct
1117    {
1118       /** The tag pattern prefix to match */
1119       const char *prefix;
1120
1121       /** The length of the prefix to match */
1122       const size_t prefix_length;
1123
1124       /** The pattern flag */
1125       const unsigned flag;
1126    } tag_pattern[] = {
1127       { "TAG:",              4, PATTERN_SPEC_TAG_PATTERN},
1128       { "NO-REQUEST-TAG:",  15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
1129       { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
1130    };
1131    int i;
1132
1133    assert(pattern);
1134    assert(buf);
1135
1136    memset(pattern, '\0', sizeof(*pattern));
1137
1138    /* Remember the original specification for the CGI pages. */
1139    pattern->spec = strdup_or_die(buf);
1140
1141    /* Check if it's a tag pattern */
1142    for (i = 0; i < SZ(tag_pattern); i++)
1143    {
1144       if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
1145       {
1146          /* The regex starts after the prefix */
1147          const char *tag_regex = buf + tag_pattern[i].prefix_length;
1148
1149          pattern->flags |= tag_pattern[i].flag;
1150
1151          return compile_pattern(tag_regex, NO_ANCHORING, pattern,
1152             &pattern->pattern.tag_regex);
1153       }
1154    }
1155
1156    /* If it isn't a tag pattern it must be an URL pattern. */
1157    pattern->flags |= PATTERN_SPEC_URL_PATTERN;
1158
1159    return compile_url_pattern(pattern, buf);
1160
1161 }
1162
1163
1164 /*********************************************************************
1165  *
1166  * Function    :  free_pattern_spec
1167  *
1168  * Description :  Called from the "unloaders".  Freez the pattern
1169  *                structure elements.
1170  *
1171  * Parameters  :
1172  *          1  :  pattern = pointer to a pattern_spec structure.
1173  *
1174  * Returns     :  N/A
1175  *
1176  *********************************************************************/
1177 void free_pattern_spec(struct pattern_spec *pattern)
1178 {
1179    if (pattern == NULL) return;
1180
1181    freez(pattern->spec);
1182 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1183    if (pattern->pattern.url_spec.host_regex)
1184    {
1185       regfree(pattern->pattern.url_spec.host_regex);
1186       freez(pattern->pattern.url_spec.host_regex);
1187    }
1188 #else
1189    freez(pattern->pattern.url_spec.dbuffer);
1190    freez(pattern->pattern.url_spec.dvec);
1191    pattern->pattern.url_spec.dcount = 0;
1192 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1193    freez(pattern->pattern.url_spec.port_list);
1194    if (pattern->pattern.url_spec.preg)
1195    {
1196       regfree(pattern->pattern.url_spec.preg);
1197       freez(pattern->pattern.url_spec.preg);
1198    }
1199    if (pattern->pattern.tag_regex)
1200    {
1201       regfree(pattern->pattern.tag_regex);
1202       freez(pattern->pattern.tag_regex);
1203    }
1204 }
1205
1206
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks whether a port is acceptable for a pattern's
 *                port list.  A pattern without a port list accepts
 *                every port; otherwise the decision is left to
 *                match_portlist().
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The port list to compare with, or NULL
 *                            if the pattern does not restrict ports.
 *
 * Returns     :  1 if the port is accepted, 0 otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (port_list == NULL)
   {
      /* No port restriction in the pattern. */
      return 1;
   }

   return (match_portlist(port_list, port) != 0);
}
1224
1225
1226 /*********************************************************************
1227  *
1228  * Function    :  host_matches
1229  *
1230  * Description :  Compares a host against a host pattern.
1231  *
1232  * Parameters  :
1233  *          1  :  url = The URL to match
1234  *          2  :  pattern = The URL pattern
1235  *
1236  * Returns     :  TRUE for yes, FALSE otherwise.
1237  *
1238  *********************************************************************/
1239 static int host_matches(const struct http_request *http,
1240                         const struct pattern_spec *pattern)
1241 {
1242 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1243    return ((NULL == pattern->pattern.url_spec.host_regex)
1244       || (0 == regexec(pattern->pattern.url_spec.host_regex, http->host, 0, NULL, 0)));
1245 #else
1246    return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
1247 #endif
1248 }
1249
1250
1251 /*********************************************************************
1252  *
1253  * Function    :  path_matches
1254  *
1255  * Description :  Compares a path against a path pattern.
1256  *
1257  * Parameters  :
1258  *          1  :  path = The path to match
1259  *          2  :  pattern = The URL pattern
1260  *
1261  * Returns     :  TRUE for yes, FALSE otherwise.
1262  *
1263  *********************************************************************/
1264 static int path_matches(const char *path, const struct pattern_spec *pattern)
1265 {
1266    return ((NULL == pattern->pattern.url_spec.preg)
1267       || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
1268 }
1269
1270
1271 /*********************************************************************
1272  *
1273  * Function    :  url_match
1274  *
1275  * Description :  Compare a URL against a URL pattern.
1276  *
1277  * Parameters  :
1278  *          1  :  pattern = a URL pattern
1279  *          2  :  url = URL to match
1280  *
1281  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1282  *
1283  *********************************************************************/
1284 int url_match(const struct pattern_spec *pattern,
1285               const struct http_request *http)
1286 {
1287    if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
1288    {
1289       /* It's not an URL pattern and thus shouldn't be matched against URLs */
1290       return 0;
1291    }
1292
1293    return (port_matches(http->port, pattern->pattern.url_spec.port_list)
1294       && host_matches(http, pattern) && path_matches(http->path, pattern));
1295
1296 }
1297
1298
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place; no copy is made and
 *                the string is not modified.  Numbers are parsed as
 *                decimal with strtol(), so out-of-range values are
 *                clamped instead of invoking undefined behaviour.
 *
 *                In a range "min-max" an omitted (or zero) max means
 *                65535 and an omitted min means 0.  An item without
 *                digits counts as 0.
 *
 * Parameters  :
 *          1  :  portlist = String with list.  NULL is treated as
 *                           an empty list that matches nothing.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (item != NULL)
   {
      /* The current item spans [item, item_end). */
      const size_t item_len = strcspn(item, ",");
      const char *item_end = item + item_len;
      const char *dash = memchr(item, '-', item_len);

      if (dash == NULL)
      {
         /*
          * No dash, check for equality.  strtol() stops at the
          * comma or the terminating NUL on its own.
          */
         if ((long)port == strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         char *parsed_end;
         long low = strtol(item, &parsed_end, 10);
         long high = strtol(dash + 1, NULL, 10);

         if (parsed_end > dash)
         {
            /*
             * Nothing but optional white space precedes the dash,
             * so strtol() took the dash for a minus sign and parsed
             * the upper bound.  The lower bound is really empty.
             */
            low = 0;
         }
         if (high == 0)
         {
            high = 65535;
         }

         if (((long)port >= low) && ((long)port <= high))
         {
            return 1;
         }
      }

      /*
       * Jump to next item, if there is one
       */
      item = (*item_end == ',') ? item_end + 1 : NULL;
   }

   return 0;

}
1377
1378
1379 /*********************************************************************
1380  *
1381  * Function    :  parse_forwarder_address
1382  *
1383  * Description :  Parse out the host and port from a forwarder address.
1384  *
1385  * Parameters  :
1386  *          1  :  address = The forwarder address to parse.
1387  *          2  :  hostname = Used to return the hostname. NULL on error.
1388  *          3  :  port = Used to return the port. Untouched if no port
1389  *                       is specified.
1390  *
1391  * Returns     :  JB_ERR_OK on success
1392  *                JB_ERR_MEMORY on out of memory
1393  *                JB_ERR_PARSE on malformed address.
1394  *
1395  *********************************************************************/
1396 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1397 {
1398    char *p = address;
1399
1400    if ((*address == '[') && (NULL == strchr(address, ']')))
1401    {
1402       /* XXX: Should do some more validity checks here. */
1403       return JB_ERR_PARSE;
1404    }
1405
1406    *hostname = strdup_or_die(address);
1407
1408    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1409    {
1410       *p++ = '\0';
1411       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1412       if (*p == ':')
1413       {
1414          *port = (int)strtol(++p, NULL, 0);
1415       }
1416    }
1417    else if (NULL != (p = strchr(*hostname, ':')))
1418    {
1419       *p++ = '\0';
1420       *port = (int)strtol(p, NULL, 0);
1421    }
1422
1423    return JB_ERR_OK;
1424
1425 }
1426
1427
1428 /*
1429   Local Variables:
1430   tab-width: 3
1431   end:
1432 */