urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.74 2012/12/07 12:49:20 fabiankeil Exp $"; /* Revision keyword string identifying this version of urlmatch.c */
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2011
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
  59 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  60
  61 enum regex_anchoring /* How compile_pattern() decorates a pattern before compiling it */
  62 {
  63    NO_ANCHORING,        /* Pattern is compiled unchanged */
  64    LEFT_ANCHORED,       /* A '^' is put in front of the pattern */
  65    RIGHT_ANCHORED,      /* A '$' is appended to the pattern */
  66    RIGHT_ANCHORED_HOST  /* An optional literal dot followed by '$' is appended */
  67 };
  68 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern); /* Forward declaration, compile_url_pattern() calls it before it is defined */
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_PARSE on malformed command/URL
 119  *                             or >100 domains deep.
 120  *
 121  *********************************************************************/
 122 jb_err init_domain_components(struct http_request *http)
 123 {
 124    char *vec[BUFFER_SIZE];
 125    size_t size;
 126    char *p;
 127
 128    http->dbuffer = strdup_or_die(http->host);
 129
 130    /* map to lower case */
 131    for (p = http->dbuffer; *p ; p++)
 132    {
 133       *p = (char)privoxy_tolower(*p);
 134    }
 135
 136    /* split the domain name into components */
 137    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 138
 139    if (http->dcount <= 0)
 140    {
 141       /*
 142        * Error: More than SZ(vec) components in domain
 143        *    or: no components in domain
 144        */
 145       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 146       return JB_ERR_PARSE;
 147    }
 148
 149    /* save a copy of the pointers in dvec */
 150    size = (size_t)http->dcount * sizeof(*http->dvec);
 151
 152    http->dvec = malloc_or_die(size);
 153
 154    memcpy(http->dvec, vec, size);
 155
 156    return JB_ERR_OK;
 157 }
 158 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 159
 160
 161 /*********************************************************************
 162  *
 163  * Function    :  url_requires_percent_encoding
 164  *
 165  * Description :  Checks if an URL contains invalid characters
 166  *                according to RFC 3986 that should be percent-encoded.
 167  *                Does not verify whether or not the passed string
 168  *                actually is a valid URL.
 169  *
 170  * Parameters  :
 171  *          1  :  url = URL to check
 172  *
 173  * Returns     :  True in case of valid URLs, false otherwise
 174  *
 175  *********************************************************************/
 176 int url_requires_percent_encoding(const char *url)
 177 {
 178    static const char allowed_characters[128] = {
 179       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 180       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 181       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 182       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 183       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 184       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 185       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 186       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 187       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 188       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 189       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 190       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 191       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 192    };
 193
 194    while (*url != '\0')
 195    {
 196       const unsigned int i = (unsigned char)*url++;
 197       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 198       {
 199          return TRUE;
 200       }
 201    }
 202
 203    return FALSE;
 204
 205 }
 206
 207
 208 /*********************************************************************
 209  *
 210  * Function    :  parse_http_url
 211  *
 212  * Description :  Parse out the host and port from the URL.  Find the
 213  *                hostname & path, port (if ':'), and/or password (if '@')
 214  *
 215  * Parameters  :
 216  *          1  :  url = URL (or is it URI?) to break down
 217  *          2  :  http = pointer to the http structure to hold elements.
 218  *                       Must be initialized with valid values (like NULLs).
 219  *          3  :  require_protocol = Whether or not URLs without
 220  *                                   protocol are acceptable.
 221  *
 222  * Returns     :  JB_ERR_OK on success
 223  *                JB_ERR_PARSE on malformed command/URL
 224  *                             or >100 domains deep.
 225  *
 226  *********************************************************************/
 227 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 228 {
 229    int host_available = 1; /* A proxy can dream. */
 230
 231    /*
 232     * Save our initial URL
 233     */
 234    http->url = strdup_or_die(url);
 235
 236    /*
 237     * Check for * URI. If found, we're done.
 238     */
 239    if (*http->url == '*')
 240    {
 241       http->path = strdup_or_die("*");
 242       http->hostport = strdup_or_die("");
 243       if (http->url[1] != '\0')
 244       {
 245          return JB_ERR_PARSE;
 246       }
 247       return JB_ERR_OK;
 248    }
 249
 250
 251    /*
 252     * Split URL into protocol,hostport,path.
 253     */
 254    {
 255       char *buf;
 256       char *url_noproto;
 257       char *url_path;
 258
 259       buf = strdup_or_die(url);
 260
 261       /* Find the start of the URL in our scratch space */
 262       url_noproto = buf;
 263       if (strncmpic(url_noproto, "http://",  7) == 0)
 264       {
 265          url_noproto += 7;
 266       }
 267       else if (strncmpic(url_noproto, "https://", 8) == 0)
 268       {
 269          /*
 270           * Should only happen when called from cgi_show_url_info().
 271           */
 272          url_noproto += 8;
 273          http->ssl = 1;
 274       }
 275       else if (*url_noproto == '/')
 276       {
 277         /*
 278          * Short request line without protocol and host.
 279          * Most likely because the client's request
 280          * was intercepted and redirected into Privoxy.
 281          */
 282          http->host = NULL;
 283          host_available = 0;
 284       }
 285       else if (require_protocol)
 286       {
 287          freez(buf);
 288          return JB_ERR_PARSE;
 289       }
 290
 291       url_path = strchr(url_noproto, '/');
 292       if (url_path != NULL)
 293       {
 294          /*
 295           * Got a path.
 296           *
 297           * NOTE: The following line ignores the path for HTTPS URLS.
 298           * This means that you get consistent behaviour if you type a
 299           * https URL in and it's parsed by the function.  (When the
 300           * URL is actually retrieved, SSL hides the path part).
 301           */
 302          http->path = strdup_or_die(http->ssl ? "/" : url_path);
 303          *url_path = '\0';
 304          http->hostport = strdup_or_die(url_noproto);
 305       }
 306       else
 307       {
 308          /*
 309           * Repair broken HTTP requests that don't contain a path,
 310           * or CONNECT requests
 311           */
 312          http->path = strdup_or_die("/");
 313          http->hostport = strdup_or_die(url_noproto);
 314       }
 315
 316       freez(buf);
 317    }
 318
 319    if (!host_available)
 320    {
 321       /* Without host, there is nothing left to do here */
 322       return JB_ERR_OK;
 323    }
 324
 325    /*
 326     * Split hostport into user/password (ignored), host, port.
 327     */
 328    {
 329       char *buf;
 330       char *host;
 331       char *port;
 332
 333       buf = strdup_or_die(http->hostport);
 334
 335       /* check if url contains username and/or password */
 336       host = strchr(buf, '@');
 337       if (host != NULL)
 338       {
 339          /* Contains username/password, skip it and the @ sign. */
 340          host++;
 341       }
 342       else
 343       {
 344          /* No username or password. */
 345          host = buf;
 346       }
 347
 348       /* Move after hostname before port number */
 349       if (*host == '[')
 350       {
 351          /* Numeric IPv6 address delimited by brackets */
 352          host++;
 353          port = strchr(host, ']');
 354
 355          if (port == NULL)
 356          {
 357             /* Missing closing bracket */
 358             freez(buf);
 359             return JB_ERR_PARSE;
 360          }
 361
 362          *port++ = '\0';
 363
 364          if (*port == '\0')
 365          {
 366             port = NULL;
 367          }
 368          else if (*port != ':')
 369          {
 370             /* Garbage after closing bracket */
 371             freez(buf);
 372             return JB_ERR_PARSE;
 373          }
 374       }
 375       else
 376       {
 377          /* Plain non-escaped hostname */
 378          port = strchr(host, ':');
 379       }
 380
 381       /* check if url contains port */
 382       if (port != NULL)
 383       {
 384          /* Contains port */
 385          char *endptr;
 386          long parsed_port;
 387          /* Terminate hostname and point to start of port string */
 388          *port++ = '\0';
 389          parsed_port = strtol(port, &endptr, 10);
 390          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 391          {
 392             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 393             freez(buf);
 394             return JB_ERR_PARSE;
 395          }
 396          http->port = (int)parsed_port;
 397       }
 398       else
 399       {
 400          /* No port specified. */
 401          http->port = (http->ssl ? 443 : 80);
 402       }
 403
 404       http->host = strdup_or_die(host);
 405
 406       freez(buf);
 407    }
 408
 409 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 410    return JB_ERR_OK;
 411 #else
 412    /* Split domain name so we can compare it against wildcards */
 413    return init_domain_components(http);
 414 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 415
 416 }
 417
 418
 419 /*********************************************************************
 420  *
 421  * Function    :  unknown_method
 422  *
 423  * Description :  Checks whether a method is unknown.
 424  *
 425  * Parameters  :
 426  *          1  :  method = points to a http method
 427  *
 428  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 429  *
 430  *********************************************************************/
 431 static int unknown_method(const char *method)
 432 {
 433    static const char * const known_http_methods[] = {
 434       /* Basic HTTP request type */
 435       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 436       /* webDAV extensions (RFC2518) */
 437       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 438       /*
 439        * Microsoft webDAV extension for Exchange 2000.  See:
 440        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 441        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 442        */
 443       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 444       /*
 445        * Another Microsoft webDAV extension for Exchange 2000.  See:
 446        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 447        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 448        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 449        */
 450       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 451       /*
 452        * Yet another WebDAV extension, this time for
 453        * Web Distributed Authoring and Versioning (RFC3253)
 454        */
 455       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 456       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 457    };
 458    int i;
 459
 460    for (i = 0; i < SZ(known_http_methods); i++)
 461    {
 462       if (0 == strcmpic(method, known_http_methods[i]))
 463       {
 464          return FALSE;
 465       }
 466    }
 467
 468    return TRUE;
 469
 470 }
 471
 472
 473 /*********************************************************************
 474  *
 475  * Function    :  parse_http_request
 476  *
 477  * Description :  Parse out the host and port from the URL.  Find the
 478  *                hostname & path, port (if ':'), and/or password (if '@')
 479  *
 480  * Parameters  :
 481  *          1  :  req = HTTP request line to break down
 482  *          2  :  http = pointer to the http structure to hold elements
 483  *
 484  * Returns     :  JB_ERR_OK on success
 485  *                JB_ERR_CGI_PARAMS on malformed command/URL
 486  *                                  or >100 domains deep.
 487  *
 488  *********************************************************************/
 489 jb_err parse_http_request(const char *req, struct http_request *http)
 490 {
 491    char *buf;
 492    char *v[10]; /* XXX: Why 10? We should only need three. */
 493    int n;
 494    jb_err err;
 495
 496    memset(http, '\0', sizeof(*http));
 497
 498    buf = strdup_or_die(req);
 499
 500    n = ssplit(buf, " \r\n", v, SZ(v));
 501    if (n != 3)
 502    {
 503       freez(buf);
 504       return JB_ERR_PARSE;
 505    }
 506
 507    /*
 508     * Fail in case of unknown methods
 509     * which we might not handle correctly.
 510     *
 511     * XXX: There should be a config option
 512     * to forward requests with unknown methods
 513     * anyway. Most of them don't need special
 514     * steps.
 515     */
 516    if (unknown_method(v[0]))
 517    {
 518       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 519       freez(buf);
 520       return JB_ERR_PARSE;
 521    }
 522
 523    if (strcmpic(v[2], "HTTP/1.1") && strcmpic(v[2], "HTTP/1.0"))
 524    {
 525       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 526          "versions are 1.0 and 1.1. This rules out: %s", v[2]);
 527       freez(buf);
 528       return JB_ERR_PARSE;
 529    }
 530
 531    http->ssl = !strcmpic(v[0], "CONNECT");
 532
 533    err = parse_http_url(v[1], http, !http->ssl);
 534    if (err)
 535    {
 536       freez(buf);
 537       return err;
 538    }
 539
 540    /*
 541     * Copy the details into the structure
 542     */
 543    http->cmd = strdup_or_die(req);
 544    http->gpc = strdup_or_die(v[0]);
 545    http->ver = strdup_or_die(v[2]);
 546
 547    freez(buf);
 548
 549    return JB_ERR_OK;
 550
 551 }
 552
 553
 554 /*********************************************************************
 555  *
 556  * Function    :  compile_pattern
 557  *
 558  * Description :  Compiles a host, domain or TAG pattern.
 559  *
 560  * Parameters  :
 561  *          1  :  pattern = The pattern to compile.
 562  *          2  :  anchoring = How the regex should be modified
 563  *                            before compilation. Can be either
 564  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 565  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 566  *          3  :  url     = In case of failures, the spec member is
 567  *                          logged and the structure freed.
 568  *          4  :  regex   = Where the compiled regex should be stored.
 569  *
 570  * Returns     :  JB_ERR_OK - Success
 571  *                JB_ERR_MEMORY - Out of memory
 572  *                JB_ERR_PARSE - Cannot parse regex
 573  *
 574  *********************************************************************/
 575 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 576                               struct url_spec *url, regex_t **regex)
 577 {
 578    int errcode;
 579    char rebuf[BUFFER_SIZE];
 580    const char *fmt = NULL;
 581
 582    assert(pattern);
 583    assert(strlen(pattern) < sizeof(rebuf) - 2);
 584
 585    if (pattern[0] == '\0')
 586    {
 587       *regex = NULL;
 588       return JB_ERR_OK;
 589    }
 590
 591    switch (anchoring)
 592    {
 593       case NO_ANCHORING:
 594          fmt = "%s";
 595          break;
 596       case RIGHT_ANCHORED:
 597          fmt = "%s$";
 598          break;
 599       case RIGHT_ANCHORED_HOST:
 600          fmt = "%s\\.?$";
 601          break;
 602       case LEFT_ANCHORED:
 603          fmt = "^%s";
 604          break;
 605       default:
 606          log_error(LOG_LEVEL_FATAL,
 607             "Invalid anchoring in compile_pattern %d", anchoring);
 608    }
 609
 610    *regex = zalloc(sizeof(**regex));
 611    if (NULL == *regex)
 612    {
 613       free_url_spec(url);
 614       return JB_ERR_MEMORY;
 615    }
 616
 617    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 618
 619    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 620
 621    if (errcode)
 622    {
 623       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 624       if (errlen > (sizeof(rebuf) - (size_t)1))
 625       {
 626          errlen = sizeof(rebuf) - (size_t)1;
 627       }
 628       rebuf[errlen] = '\0';
 629       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 630          pattern, url->spec, rebuf);
 631       free_url_spec(url);
 632
 633       return JB_ERR_PARSE;
 634    }
 635
 636    return JB_ERR_OK;
 637
 638 }
 639
 640
 641 /*********************************************************************
 642  *
 643  * Function    :  compile_url_pattern
 644  *
 645  * Description :  Compiles the three parts of an URL pattern.
 646  *
 647  * Parameters  :
 648  *          1  :  url = Target url_spec to be filled in.
 649  *          2  :  buf = The url pattern to compile. Will be messed up.
 650  *
 651  * Returns     :  JB_ERR_OK - Success
 652  *                JB_ERR_MEMORY - Out of memory
 653  *                JB_ERR_PARSE - Cannot parse regex
 654  *
 655  *********************************************************************/
 656 static jb_err compile_url_pattern(struct url_spec *url, char *buf)
 657 {
 658    char *p;
 659
 660    p = strchr(buf, '/');
 661    if (NULL != p)
 662    {
 663       /*
 664        * Only compile the regex if it consists of more than
 665        * a single slash, otherwise it wouldn't affect the result.
 666        */
 667       if (p[1] != '\0')
 668       {
 669          /*
 670           * XXX: does it make sense to compile the slash at the beginning?
 671           */
 672          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->preg);
 673
 674          if (JB_ERR_OK != err)
 675          {
 676             return err;
 677          }
 678       }
 679       *p = '\0';
 680    }
 681
 682    /*
 683     * IPv6 numeric hostnames can contain colons, thus we need
 684     * to delimit the hostname before the real port separator.
 685     * As brackets are already used in the hostname pattern,
 686     * we use angle brackets ('<', '>') instead.
 687     */
 688    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 689    {
 690       *p++ = '\0';
 691       buf++;
 692
 693       if (*p == '\0')
 694       {
 695          /* IPv6 address without port number */
 696          p = NULL;
 697       }
 698       else if (*p != ':')
 699       {
 700          /* Garbage after address delimiter */
 701          return JB_ERR_PARSE;
 702       }
 703    }
 704    else
 705    {
 706       p = strchr(buf, ':');
 707    }
 708
 709    if (NULL != p)
 710    {
 711       *p++ = '\0';
 712       url->port_list = strdup_or_die(p);
 713    }
 714    else
 715    {
 716       url->port_list = NULL;
 717    }
 718
 719    if (buf[0] != '\0')
 720    {
 721       return compile_host_pattern(url, buf);
 722    }
 723
 724    return JB_ERR_OK;
 725
 726 }
 727
 728
 729 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 730 /*********************************************************************
 731  *
 732  * Function    :  compile_host_pattern
 733  *
 734  * Description :  Parses and compiles a host pattern.
 735  *
 736  * Parameters  :
 737  *          1  :  url = Target url_spec to be filled in.
 738  *          2  :  host_pattern = Host pattern to compile.
 739  *
 740  * Returns     :  JB_ERR_OK - Success
 741  *                JB_ERR_MEMORY - Out of memory
 742  *                JB_ERR_PARSE - Cannot parse regex
 743  *
 744  *********************************************************************/
 745 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 746 {
 747    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->host_regex);
 748 }
 749
 750 #else
 751
 752 /*********************************************************************
 753  *
 754  * Function    :  compile_host_pattern
 755  *
 756  * Description :  Parses and "compiles" an old-school host pattern.
 757  *
 758  * Parameters  :
 759  *          1  :  url = Target url_spec to be filled in.
 760  *          2  :  host_pattern = Host pattern to parse.
 761  *
 762  * Returns     :  JB_ERR_OK - Success
 763  *                JB_ERR_PARSE - Cannot parse regex
 764  *
 765  *********************************************************************/
 766 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 767 {
 768    char *v[150];
 769    size_t size;
 770    char *p;
 771
 772    /*
 773     * Parse domain part
 774     */
 775    if (host_pattern[strlen(host_pattern) - 1] == '.')
 776    {
 777       url->unanchored |= ANCHOR_RIGHT;
 778    }
 779    if (host_pattern[0] == '.')
 780    {
 781       url->unanchored |= ANCHOR_LEFT;
 782    }
 783
 784    /*
 785     * Split domain into components
 786     */
 787    url->dbuffer = strdup_or_die(host_pattern);
 788
 789    /*
 790     * Map to lower case
 791     */
 792    for (p = url->dbuffer; *p ; p++)
 793    {
 794       *p = (char)privoxy_tolower(*p);
 795    }
 796
 797    /*
 798     * Split the domain name into components
 799     */
 800    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v));
 801
 802    if (url->dcount < 0)
 803    {
 804       free_url_spec(url);
 805       return JB_ERR_PARSE;
 806    }
 807    else if (url->dcount != 0)
 808    {
 809       /*
 810        * Save a copy of the pointers in dvec
 811        */
 812       size = (size_t)url->dcount * sizeof(*url->dvec);
 813
 814       url->dvec = malloc_or_die(size);
 815
 816       memcpy(url->dvec, v, size);
 817    }
 818    /*
 819     * else dcount == 0 in which case we needn't do anything,
 820     * since dvec will never be accessed and the pattern will
 821     * match all domains.
 822     */
 823    return JB_ERR_OK;
 824 }
 825
 826
 827 /*********************************************************************
 828  *
 829  * Function    :  simplematch
 830  *
 831  * Description :  String matching, with a (greedy) '*' wildcard that
 832  *                stands for zero or more arbitrary characters and
 833  *                character classes in [], which take both enumerations
 834  *                and ranges.
 835  *
 836  * Parameters  :
 837  *          1  :  pattern = pattern for matching
 838  *          2  :  text    = text to be matched
 839  *
 840  * Returns     :  0 if match, else nonzero
 841  *
 842  *********************************************************************/
 843 static int simplematch(const char *pattern, const char *text)
 844 {
 845    const unsigned char *pat = (const unsigned char *)pattern;
 846    const unsigned char *txt = (const unsigned char *)text;
 847    const unsigned char *fallback = pat;
 848    int wildcard = 0;
 849
 850    unsigned char lastchar = 'a';
 851    unsigned i;
 852    unsigned char charmap[32];
 853
 854    while (*txt)
 855    {
 856
 857       /* EOF pattern but !EOF text? */
 858       if (*pat == '\0')
 859       {
 860          if (wildcard)
 861          {
 862             pat = fallback;
 863          }
 864          else
 865          {
 866             return 1;
 867          }
 868       }
 869
 870       /* '*' in the pattern?  */
 871       if (*pat == '*')
 872       {
 873
 874          /* The pattern ends afterwards? Speed up the return. */
 875          if (*++pat == '\0')
 876          {
 877             return 0;
 878          }
 879
 880          /* Else, set wildcard mode and remember position after '*' */
 881          wildcard = 1;
 882          fallback = pat;
 883       }
 884
 885       /* Character range specification? */
 886       if (*pat == '[')
 887       {
 888          memset(charmap, '\0', sizeof(charmap));
 889
 890          while (*++pat != ']')
 891          {
 892             if (!*pat)
 893             {
 894                return 1;
 895             }
 896             else if (*pat == '-')
 897             {
 898                if ((*++pat == ']') || *pat == '\0')
 899                {
 900                   return(1);
 901                }
 902                for (i = lastchar; i <= *pat; i++)
 903                {
 904                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 905                }
 906             }
 907             else
 908             {
 909                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 910                lastchar = *pat;
 911             }
 912          }
 913       } /* -END- if Character range specification */
 914
 915
 916       /*
 917        * Char match, or char range match?
 918        */
 919       if ((*pat == *txt)
 920        || (*pat == '?')
 921        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 922       {
 923          /*
 924           * Success: Go ahead
 925           */
 926          pat++;
 927       }
 928       else if (!wildcard)
 929       {
 930          /*
 931           * No match && no wildcard: No luck
 932           */
 933          return 1;
 934       }
 935       else if (pat != fallback)
 936       {
 937          /*
 938           * Increment text pointer if in char range matching
 939           */
 940          if (*pat == ']')
 941          {
 942             txt++;
 943          }
 944          /*
 945           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 946           */
 947          pat = fallback;
 948          /*
 949           * Restart matching from current text pointer
 950           */
 951          continue;
 952       }
 953       txt++;
 954    }
 955
 956    /* Cut off extra '*'s */
 957    if (*pat == '*') pat++;
 958
 959    /* If this is the pattern's end, fine! */
 960    return(*pat);
 961
 962 }
 963
 964
 965 /*********************************************************************
 966  *
 967  * Function    :  simple_domaincmp
 968  *
 969  * Description :  Domain-wise Compare fqdn's.  The comparison is
 970  *                both left- and right-anchored.  The individual
 971  *                domain names are compared with simplematch().
 972  *                This is only used by domain_match.
 973  *
 974  * Parameters  :
 975  *          1  :  pv = array of patterns to compare
 976  *          2  :  fv = array of domain components to compare
 977  *          3  :  len = length of the arrays (both arrays are the
 978  *                      same length - if they weren't, it couldn't
 979  *                      possibly be a match).
 980  *
 981  * Returns     :  0 => domains are equivalent, else no match.
 982  *
 983  *********************************************************************/
 984 static int simple_domaincmp(char **pv, char **fv, int len)
 985 {
 986    int n;
 987
 988    for (n = 0; n < len; n++)
 989    {
 990       if (simplematch(pv[n], fv[n]))
 991       {
 992          return 1;
 993       }
 994    }
 995
 996    return 0;
 997
 998 }
 999
1000
1001 /*********************************************************************
1002  *
1003  * Function    :  domain_match
1004  *
1005  * Description :  Domain-wise Compare fqdn's. Governed by the bitmap in
1006  *                pattern->unanchored, the comparison is un-, left-,
1007  *                right-anchored, or both.
1008  *                The individual domain names are compared with
1009  *                simplematch().
1010  *
1011  * Parameters  :
1012  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
1013  *          2  :  fqdn = domain name against which the patterns are compared.
1014  *
1015  * Returns     :  0 => domains are equivalent, else no match.
1016  *
1017  *********************************************************************/
1018 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
1019 {
1020    char **pv, **fv;  /* vectors  */
1021    int    plen, flen;
1022    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1023
1024    plen = pattern->dcount;
1025    flen = fqdn->dcount;
1026
1027    if (flen < plen)
1028    {
1029       /* fqdn is too short to match this pattern */
1030       return 1;
1031    }
1032
1033    pv   = pattern->dvec;
1034    fv   = fqdn->dvec;
1035
1036    if (unanchored == ANCHOR_LEFT)
1037    {
1038       /*
1039        * Right anchored.
1040        *
1041        * Convert this into a fully anchored pattern with
1042        * the fqdn and pattern the same length
1043        */
1044       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1045       return simple_domaincmp(pv, fv, plen);
1046    }
1047    else if (unanchored == 0)
1048    {
1049       /* Fully anchored, check length */
1050       if (flen != plen)
1051       {
1052          return 1;
1053       }
1054       return simple_domaincmp(pv, fv, plen);
1055    }
1056    else if (unanchored == ANCHOR_RIGHT)
1057    {
1058       /* Left anchored, ignore all extra in fqdn */
1059       return simple_domaincmp(pv, fv, plen);
1060    }
1061    else
1062    {
1063       /* Unanchored */
1064       int n;
1065       int maxn = flen - plen;
1066       for (n = 0; n <= maxn; n++)
1067       {
1068          if (!simple_domaincmp(pv, fv, plen))
1069          {
1070             return 0;
1071          }
1072          /*
1073           * Doesn't match from start of fqdn
1074           * Try skipping first part of fqdn
1075           */
1076          fv++;
1077       }
1078       return 1;
1079    }
1080
1081 }
1082 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1083
1084
1085 /*********************************************************************
1086  *
1087  * Function    :  create_url_spec
1088  *
1089  * Description :  Creates a "url_spec" structure from a string.
1090  *                When finished, free with free_url_spec().
1091  *
1092  * Parameters  :
1093  *          1  :  url = Target url_spec to be filled in.  Will be
1094  *                      zeroed before use.
1095  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1096  *                      contents of this buffer are destroyed by this
1097  *                      function.  If this function succeeds, the
1098  *                      buffer is copied to url->spec.  If this
1099  *                      function fails, the contents of the buffer
1100  *                      are lost forever.
1101  *
1102  * Returns     :  JB_ERR_OK - Success
1103  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1104  *                               written to system log)
1105  *
1106  *********************************************************************/
1107 jb_err create_url_spec(struct url_spec *url, char *buf)
1108 {
1109    assert(url);
1110    assert(buf);
1111
1112    memset(url, '\0', sizeof(*url));
1113
1114    /* Remember the original specification for the CGI pages. */
1115    url->spec = strdup_or_die(buf);
1116
1117    /* Is it a tag pattern? */
1118    if (0 == strncmpic(url->spec, "TAG:", 4))
1119    {
1120       /* The pattern starts with the first character after "TAG:" */
1121       const char *tag_pattern = buf + 4;
1122       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
1123    }
1124
1125    /* If it isn't a tag pattern it must be an URL pattern. */
1126    return compile_url_pattern(url, buf);
1127 }
1128
1129
1130 /*********************************************************************
1131  *
1132  * Function    :  free_url_spec
1133  *
1134  * Description :  Called from the "unloaders".  Freez the url
1135  *                structure elements.
1136  *
1137  * Parameters  :
1138  *          1  :  url = pointer to a url_spec structure.
1139  *
1140  * Returns     :  N/A
1141  *
1142  *********************************************************************/
1143 void free_url_spec(struct url_spec *url)
1144 {
1145    if (url == NULL) return;
1146
1147    freez(url->spec);
1148 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1149    if (url->host_regex)
1150    {
1151       regfree(url->host_regex);
1152       freez(url->host_regex);
1153    }
1154 #else
1155    freez(url->dbuffer);
1156    freez(url->dvec);
1157    url->dcount = 0;
1158 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1159    freez(url->port_list);
1160    if (url->preg)
1161    {
1162       regfree(url->preg);
1163       freez(url->preg);
1164    }
1165    if (url->tag_regex)
1166    {
1167       regfree(url->tag_regex);
1168       freez(url->tag_regex);
1169    }
1170 }
1171
1172
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks a port against the port list of a pattern.
 *                A pattern without port list accepts every port.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL if there is no restriction.
 *
 * Returns     :  TRUE for yes, FALSE otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (NULL == port_list)
   {
      /* Nothing to compare against, so the port is acceptable. */
      return 1;
   }

   return (0 != match_portlist(port_list, port));
}
1190
1191
1192 /*********************************************************************
1193  *
1194  * Function    :  host_matches
1195  *
1196  * Description :  Compares a host against a host pattern.
1197  *
1198  * Parameters  :
1199  *          1  :  url = The URL to match
1200  *          2  :  pattern = The URL pattern
1201  *
1202  * Returns     :  TRUE for yes, FALSE otherwise.
1203  *
1204  *********************************************************************/
1205 static int host_matches(const struct http_request *http,
1206                         const struct url_spec *pattern)
1207 {
1208 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1209    return ((NULL == pattern->host_regex)
1210       || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)));
1211 #else
1212    return ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)));
1213 #endif
1214 }
1215
1216
1217 /*********************************************************************
1218  *
1219  * Function    :  path_matches
1220  *
1221  * Description :  Compares a path against a path pattern.
1222  *
1223  * Parameters  :
1224  *          1  :  path = The path to match
1225  *          2  :  pattern = The URL pattern
1226  *
1227  * Returns     :  TRUE for yes, FALSE otherwise.
1228  *
1229  *********************************************************************/
1230 static int path_matches(const char *path, const struct url_spec *pattern)
1231 {
1232    return ((NULL == pattern->preg)
1233       || (0 == regexec(pattern->preg, path, 0, NULL, 0)));
1234 }
1235
1236
1237 /*********************************************************************
1238  *
1239  * Function    :  url_match
1240  *
1241  * Description :  Compare a URL against a URL pattern.
1242  *
1243  * Parameters  :
1244  *          1  :  pattern = a URL pattern
1245  *          2  :  url = URL to match
1246  *
1247  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1248  *
1249  *********************************************************************/
1250 int url_match(const struct url_spec *pattern,
1251               const struct http_request *http)
1252 {
1253    if (pattern->tag_regex != NULL)
1254    {
1255       /* It's a tag pattern and shouldn't be matched against URLs */
1256       return 0;
1257    }
1258
1259    return (port_matches(http->port, pattern->port_list)
1260       && host_matches(http, pattern) && path_matches(http->path, pattern));
1261
1262 }
1263
1264
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place; it is neither
 *                modified nor copied.  Within an item the first '-'
 *                separates the lower from the upper bound.  A bound
 *                that is missing or not numeric counts as 0 for the
 *                lower and as 65535 for the upper end, so "-1023"
 *                means 0-1023 and "1024-" means 1024-65535.  An
 *                item without '-' that is empty or not numeric is
 *                treated as port 0.
 *
 * Parameters  :
 *          1  :  portlist = String with list, must not be NULL
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   for (;;)
   {
      /* An item extends up to the next comma or the end of the list. */
      const size_t item_len = strcspn(item, ",");
      const char *dash = memchr(item, '-', item_len);

      if (NULL == dash)
      {
         /*
          * No dash, check for equality.  strtol() stops at the
          * comma or the terminating NUL, so it cannot run into
          * the following item.
          */
         if ((long)port == strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         char *end = NULL;
         long min = strtol(item, &end, 10);
         long max = strtol(dash + 1, NULL, 10);

         if ((const char *)end > dash)
         {
            /*
             * Only whitespace precedes the dash and strtol() took
             * the separator for a minus sign.  The lower bound is
             * in fact missing.
             */
            min = 0;
         }

         if (0 == max)
         {
            max = 65535;
         }

         if (((long)port >= min) && ((long)port <= max))
         {
            return 1;
         }
      }

      if ('\0' == item[item_len])
      {
         /* That was the last item. */
         break;
      }

      /* Jump to the item behind the comma. */
      item += item_len + 1;
   }

   return 0;
}
1343
1344
1345 /*********************************************************************
1346  *
1347  * Function    :  parse_forwarder_address
1348  *
1349  * Description :  Parse out the host and port from a forwarder address.
1350  *
1351  * Parameters  :
1352  *          1  :  address = The forwarder address to parse.
1353  *          2  :  hostname = Used to return the hostname. NULL on error.
1354  *          3  :  port = Used to return the port. Untouched if no port
1355  *                       is specified.
1356  *
1357  * Returns     :  JB_ERR_OK on success
1358  *                JB_ERR_MEMORY on out of memory
1359  *                JB_ERR_PARSE on malformed address.
1360  *
1361  *********************************************************************/
1362 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1363 {
1364    char *p = address;
1365
1366    if ((*address == '[') && (NULL == strchr(address, ']')))
1367    {
1368       /* XXX: Should do some more validity checks here. */
1369       return JB_ERR_PARSE;
1370    }
1371
1372    *hostname = strdup_or_die(address);
1373
1374    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1375    {
1376       *p++ = '\0';
1377       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1378       if (*p == ':')
1379       {
1380          *port = (int)strtol(++p, NULL, 0);
1381       }
1382    }
1383    else if (NULL != (p = strchr(*hostname, ':')))
1384    {
1385       *p++ = '\0';
1386       *port = (int)strtol(p, NULL, 0);
1387    }
1388
1389    return JB_ERR_OK;
1390
1391 }
1392
1393
1394 /*
1395   Local Variables:
1396   tab-width: 3
1397   end:
1398 */