urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.66 2011/12/31 14:53:18 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2011
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
  59 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  60
  /*
   * Selects how compile_pattern() decorates a pattern before it is
   * handed to the regex compiler. The explicit values below are the
   * same ones the compiler would assign implicitly.
   */
  enum regex_anchoring
  {
     NO_ANCHORING        = 0, /* pattern is compiled as given          */
     LEFT_ANCHORED       = 1, /* a '^' is put in front of the pattern  */
     RIGHT_ANCHORED      = 2, /* a '$' is appended to the pattern      */
     RIGHT_ANCHORED_HOST = 3  /* an optional dot and '$' are appended  */
  };
  68 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern);
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_MEMORY on out of memory
 119  *                JB_ERR_PARSE on malformed command/URL
 120  *                             or >100 domains deep.
 121  *
 122  *********************************************************************/
 123 jb_err init_domain_components(struct http_request *http)
 124 {
 125    char *vec[BUFFER_SIZE];
 126    size_t size;
 127    char *p;
 128
 129    http->dbuffer = strdup(http->host);
 130    if (NULL == http->dbuffer)
 131    {
 132       return JB_ERR_MEMORY;
 133    }
 134
 135    /* map to lower case */
 136    for (p = http->dbuffer; *p ; p++)
 137    {
 138       *p = (char)privoxy_tolower(*p);
 139    }
 140
 141    /* split the domain name into components */
 142    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 143
 144    if (http->dcount <= 0)
 145    {
 146       /*
 147        * Error: More than SZ(vec) components in domain
 148        *    or: no components in domain
 149        */
 150       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 151       return JB_ERR_PARSE;
 152    }
 153
 154    /* save a copy of the pointers in dvec */
 155    size = (size_t)http->dcount * sizeof(*http->dvec);
 156
 157    http->dvec = (char **)malloc(size);
 158    if (NULL == http->dvec)
 159    {
 160       return JB_ERR_MEMORY;
 161    }
 162
 163    memcpy(http->dvec, vec, size);
 164
 165    return JB_ERR_OK;
 166 }
 167 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 168
 169
 170 /*********************************************************************
 171  *
 172  * Function    :  url_requires_percent_encoding
 173  *
 174  * Description :  Checks if an URL contains invalid characters
 175  *                according to RFC 3986 that should be percent-encoded.
 176  *                Does not verify whether or not the passed string
 177  *                actually is a valid URL.
 178  *
 179  * Parameters  :
 180  *          1  :  url = URL to check
 181  *
 182  * Returns     :  True in case of valid URLs, false otherwise
 183  *
 184  *********************************************************************/
 185 int url_requires_percent_encoding(const char *url)
 186 {
 187    static const char allowed_characters[128] = {
 188       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 189       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 190       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 191       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 192       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 193       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 194       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 195       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 196       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 197       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 198       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 199       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 200       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 201    };
 202
 203    while (*url != '\0')
 204    {
 205       const unsigned int i = (unsigned char)*url++;
 206       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 207       {
 208          return TRUE;
 209       }
 210    }
 211
 212    return FALSE;
 213
 214 }
 215
 216
 217 /*********************************************************************
 218  *
 219  * Function    :  parse_http_url
 220  *
 221  * Description :  Parse out the host and port from the URL.  Find the
 222  *                hostname & path, port (if ':'), and/or password (if '@')
 223  *
 224  * Parameters  :
 225  *          1  :  url = URL (or is it URI?) to break down
 226  *          2  :  http = pointer to the http structure to hold elements.
 227  *                       Must be initialized with valid values (like NULLs).
 228  *          3  :  require_protocol = Whether or not URLs without
 229  *                                   protocol are acceptable.
 230  *
 231  * Returns     :  JB_ERR_OK on success
 232  *                JB_ERR_MEMORY on out of memory
 233  *                JB_ERR_PARSE on malformed command/URL
 234  *                             or >100 domains deep.
 235  *
 236  *********************************************************************/
 237 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 238 {
 239    int host_available = 1; /* A proxy can dream. */
 240
 241    /*
 242     * Save our initial URL
 243     */
 244    http->url = strdup(url);
 245    if (http->url == NULL)
 246    {
 247       return JB_ERR_MEMORY;
 248    }
 249
 250
 251    /*
 252     * Check for * URI. If found, we're done.
 253     */
 254    if (*http->url == '*')
 255    {
 256       if  ( NULL == (http->path = strdup("*"))
 257          || NULL == (http->hostport = strdup("")) )
 258       {
 259          return JB_ERR_MEMORY;
 260       }
 261       if (http->url[1] != '\0')
 262       {
 263          return JB_ERR_PARSE;
 264       }
 265       return JB_ERR_OK;
 266    }
 267
 268
 269    /*
 270     * Split URL into protocol,hostport,path.
 271     */
 272    {
 273       char *buf;
 274       char *url_noproto;
 275       char *url_path;
 276
 277       buf = strdup(url);
 278       if (buf == NULL)
 279       {
 280          return JB_ERR_MEMORY;
 281       }
 282
 283       /* Find the start of the URL in our scratch space */
 284       url_noproto = buf;
 285       if (strncmpic(url_noproto, "http://",  7) == 0)
 286       {
 287          url_noproto += 7;
 288       }
 289       else if (strncmpic(url_noproto, "https://", 8) == 0)
 290       {
 291          /*
 292           * Should only happen when called from cgi_show_url_info().
 293           */
 294          url_noproto += 8;
 295          http->ssl = 1;
 296       }
 297       else if (*url_noproto == '/')
 298       {
 299         /*
 300          * Short request line without protocol and host.
 301          * Most likely because the client's request
 302          * was intercepted and redirected into Privoxy.
 303          */
 304          http->host = NULL;
 305          host_available = 0;
 306       }
 307       else if (require_protocol)
 308       {
 309          freez(buf);
 310          return JB_ERR_PARSE;
 311       }
 312
 313       url_path = strchr(url_noproto, '/');
 314       if (url_path != NULL)
 315       {
 316          /*
 317           * Got a path.
 318           *
 319           * NOTE: The following line ignores the path for HTTPS URLS.
 320           * This means that you get consistent behaviour if you type a
 321           * https URL in and it's parsed by the function.  (When the
 322           * URL is actually retrieved, SSL hides the path part).
 323           */
 324          http->path = strdup(http->ssl ? "/" : url_path);
 325          *url_path = '\0';
 326          http->hostport = strdup(url_noproto);
 327       }
 328       else
 329       {
 330          /*
 331           * Repair broken HTTP requests that don't contain a path,
 332           * or CONNECT requests
 333           */
 334          http->path = strdup("/");
 335          http->hostport = strdup(url_noproto);
 336       }
 337
 338       freez(buf);
 339
 340       if ( (http->path == NULL)
 341         || (http->hostport == NULL))
 342       {
 343          return JB_ERR_MEMORY;
 344       }
 345    }
 346
 347    if (!host_available)
 348    {
 349       /* Without host, there is nothing left to do here */
 350       return JB_ERR_OK;
 351    }
 352
 353    /*
 354     * Split hostport into user/password (ignored), host, port.
 355     */
 356    {
 357       char *buf;
 358       char *host;
 359       char *port;
 360
 361       buf = strdup(http->hostport);
 362       if (buf == NULL)
 363       {
 364          return JB_ERR_MEMORY;
 365       }
 366
 367       /* check if url contains username and/or password */
 368       host = strchr(buf, '@');
 369       if (host != NULL)
 370       {
 371          /* Contains username/password, skip it and the @ sign. */
 372          host++;
 373       }
 374       else
 375       {
 376          /* No username or password. */
 377          host = buf;
 378       }
 379
 380       /* Move after hostname before port number */
 381       if (*host == '[')
 382       {
 383          /* Numeric IPv6 address delimited by brackets */
 384          host++;
 385          port = strchr(host, ']');
 386
 387          if (port == NULL)
 388          {
 389             /* Missing closing bracket */
 390             freez(buf);
 391             return JB_ERR_PARSE;
 392          }
 393
 394          *port++ = '\0';
 395
 396          if (*port == '\0')
 397          {
 398             port = NULL;
 399          }
 400          else if (*port != ':')
 401          {
 402             /* Garbage after closing bracket */
 403             freez(buf);
 404             return JB_ERR_PARSE;
 405          }
 406       }
 407       else
 408       {
 409          /* Plain non-escaped hostname */
 410          port = strchr(host, ':');
 411       }
 412
 413       /* check if url contains port */
 414       if (port != NULL)
 415       {
 416          /* Contains port */
 417          /* Terminate hostname and point to start of port string */
 418          *port++ = '\0';
 419          http->port = atoi(port);
 420       }
 421       else
 422       {
 423          /* No port specified. */
 424          http->port = (http->ssl ? 443 : 80);
 425       }
 426
 427       http->host = strdup(host);
 428
 429       freez(buf);
 430
 431       if (http->host == NULL)
 432       {
 433          return JB_ERR_MEMORY;
 434       }
 435    }
 436
 437 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 438    return JB_ERR_OK;
 439 #else
 440    /* Split domain name so we can compare it against wildcards */
 441    return init_domain_components(http);
 442 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 443
 444 }
 445
 446
 447 /*********************************************************************
 448  *
 449  * Function    :  unknown_method
 450  *
 451  * Description :  Checks whether a method is unknown.
 452  *
 453  * Parameters  :
 454  *          1  :  method = points to a http method
 455  *
 456  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 457  *
 458  *********************************************************************/
 459 static int unknown_method(const char *method)
 460 {
 461    static const char * const known_http_methods[] = {
 462       /* Basic HTTP request type */
 463       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 464       /* webDAV extensions (RFC2518) */
 465       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 466       /*
 467        * Microsoft webDAV extension for Exchange 2000.  See:
 468        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 469        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 470        */
 471       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 472       /*
 473        * Another Microsoft webDAV extension for Exchange 2000.  See:
 474        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 475        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 476        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 477        */
 478       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 479       /*
 480        * Yet another WebDAV extension, this time for
 481        * Web Distributed Authoring and Versioning (RFC3253)
 482        */
 483       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 484       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 485    };
 486    int i;
 487
 488    for (i = 0; i < SZ(known_http_methods); i++)
 489    {
 490       if (0 == strcmpic(method, known_http_methods[i]))
 491       {
 492          return FALSE;
 493       }
 494    }
 495
 496    return TRUE;
 497
 498 }
 499
 500
 501 /*********************************************************************
 502  *
 503  * Function    :  parse_http_request
 504  *
 505  * Description :  Parse out the host and port from the URL.  Find the
 506  *                hostname & path, port (if ':'), and/or password (if '@')
 507  *
 508  * Parameters  :
 509  *          1  :  req = HTTP request line to break down
 510  *          2  :  http = pointer to the http structure to hold elements
 511  *
 512  * Returns     :  JB_ERR_OK on success
 513  *                JB_ERR_MEMORY on out of memory
 514  *                JB_ERR_CGI_PARAMS on malformed command/URL
 515  *                                  or >100 domains deep.
 516  *
 517  *********************************************************************/
 518 jb_err parse_http_request(const char *req, struct http_request *http)
 519 {
 520    char *buf;
 521    char *v[10]; /* XXX: Why 10? We should only need three. */
 522    int n;
 523    jb_err err;
 524
 525    memset(http, '\0', sizeof(*http));
 526
 527    buf = strdup(req);
 528    if (buf == NULL)
 529    {
 530       return JB_ERR_MEMORY;
 531    }
 532
 533    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 534    if (n != 3)
 535    {
 536       freez(buf);
 537       return JB_ERR_PARSE;
 538    }
 539
 540    /*
 541     * Fail in case of unknown methods
 542     * which we might not handle correctly.
 543     *
 544     * XXX: There should be a config option
 545     * to forward requests with unknown methods
 546     * anyway. Most of them don't need special
 547     * steps.
 548     */
 549    if (unknown_method(v[0]))
 550    {
 551       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 552       freez(buf);
 553       return JB_ERR_PARSE;
 554    }
 555
 556    if (strcmpic(v[2], "HTTP/1.1") && strcmpic(v[2], "HTTP/1.0"))
 557    {
 558       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 559          "versions are 1.0 and 1.1. This rules out: %s", v[2]);
 560       freez(buf);
 561       return JB_ERR_PARSE;
 562    }
 563
 564    http->ssl = !strcmpic(v[0], "CONNECT");
 565
 566    err = parse_http_url(v[1], http, !http->ssl);
 567    if (err)
 568    {
 569       freez(buf);
 570       return err;
 571    }
 572
 573    /*
 574     * Copy the details into the structure
 575     */
 576    http->cmd = strdup(req);
 577    http->gpc = strdup(v[0]);
 578    http->ver = strdup(v[2]);
 579
 580    freez(buf);
 581
 582    if ( (http->cmd == NULL)
 583      || (http->gpc == NULL)
 584      || (http->ver == NULL) )
 585    {
 586       return JB_ERR_MEMORY;
 587    }
 588
 589    return JB_ERR_OK;
 590
 591 }
 592
 593
 594 /*********************************************************************
 595  *
 596  * Function    :  compile_pattern
 597  *
 598  * Description :  Compiles a host, domain or TAG pattern.
 599  *
 600  * Parameters  :
 601  *          1  :  pattern = The pattern to compile.
 602  *          2  :  anchoring = How the regex should be modified
 603  *                            before compilation. Can be either
 604  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 605  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 606  *          3  :  url     = In case of failures, the spec member is
 607  *                          logged and the structure freed.
 608  *          4  :  regex   = Where the compiled regex should be stored.
 609  *
 610  * Returns     :  JB_ERR_OK - Success
 611  *                JB_ERR_MEMORY - Out of memory
 612  *                JB_ERR_PARSE - Cannot parse regex
 613  *
 614  *********************************************************************/
 615 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 616                               struct url_spec *url, regex_t **regex)
 617 {
 618    int errcode;
 619    char rebuf[BUFFER_SIZE];
 620    const char *fmt = NULL;
 621
 622    assert(pattern);
 623    assert(strlen(pattern) < sizeof(rebuf) - 2);
 624
 625    if (pattern[0] == '\0')
 626    {
 627       *regex = NULL;
 628       return JB_ERR_OK;
 629    }
 630
 631    switch (anchoring)
 632    {
 633       case NO_ANCHORING:
 634          fmt = "%s";
 635          break;
 636       case RIGHT_ANCHORED:
 637          fmt = "%s$";
 638          break;
 639       case RIGHT_ANCHORED_HOST:
 640          fmt = "%s\\.?$";
 641          break;
 642       case LEFT_ANCHORED:
 643          fmt = "^%s";
 644          break;
 645       default:
 646          log_error(LOG_LEVEL_FATAL,
 647             "Invalid anchoring in compile_pattern %d", anchoring);
 648    }
 649
 650    *regex = zalloc(sizeof(**regex));
 651    if (NULL == *regex)
 652    {
 653       free_url_spec(url);
 654       return JB_ERR_MEMORY;
 655    }
 656
 657    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 658
 659    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 660
 661    if (errcode)
 662    {
 663       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 664       if (errlen > (sizeof(rebuf) - (size_t)1))
 665       {
 666          errlen = sizeof(rebuf) - (size_t)1;
 667       }
 668       rebuf[errlen] = '\0';
 669       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 670          pattern, url->spec, rebuf);
 671       free_url_spec(url);
 672
 673       return JB_ERR_PARSE;
 674    }
 675
 676    return JB_ERR_OK;
 677
 678 }
 679
 680
 681 /*********************************************************************
 682  *
 683  * Function    :  compile_url_pattern
 684  *
 685  * Description :  Compiles the three parts of an URL pattern.
 686  *
 687  * Parameters  :
 688  *          1  :  url = Target url_spec to be filled in.
 689  *          2  :  buf = The url pattern to compile. Will be messed up.
 690  *
 691  * Returns     :  JB_ERR_OK - Success
 692  *                JB_ERR_MEMORY - Out of memory
 693  *                JB_ERR_PARSE - Cannot parse regex
 694  *
 695  *********************************************************************/
 696 static jb_err compile_url_pattern(struct url_spec *url, char *buf)
 697 {
 698    char *p;
 699
 700    p = strchr(buf, '/');
 701    if (NULL != p)
 702    {
 703       /*
 704        * Only compile the regex if it consists of more than
 705        * a single slash, otherwise it wouldn't affect the result.
 706        */
 707       if (p[1] != '\0')
 708       {
 709          /*
 710           * XXX: does it make sense to compile the slash at the beginning?
 711           */
 712          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->preg);
 713
 714          if (JB_ERR_OK != err)
 715          {
 716             return err;
 717          }
 718       }
 719       *p = '\0';
 720    }
 721
 722    /*
 723     * IPv6 numeric hostnames can contain colons, thus we need
 724     * to delimit the hostname before the real port separator.
 725     * As brackets are already used in the hostname pattern,
 726     * we use angle brackets ('<', '>') instead.
 727     */
 728    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 729    {
 730       *p++ = '\0';
 731       buf++;
 732
 733       if (*p == '\0')
 734       {
 735          /* IPv6 address without port number */
 736          p = NULL;
 737       }
 738       else if (*p != ':')
 739       {
 740          /* Garbage after address delimiter */
 741          return JB_ERR_PARSE;
 742       }
 743    }
 744    else
 745    {
 746       p = strchr(buf, ':');
 747    }
 748
 749    if (NULL != p)
 750    {
 751       *p++ = '\0';
 752       url->port_list = strdup(p);
 753       if (NULL == url->port_list)
 754       {
 755          return JB_ERR_MEMORY;
 756       }
 757    }
 758    else
 759    {
 760       url->port_list = NULL;
 761    }
 762
 763    if (buf[0] != '\0')
 764    {
 765       return compile_host_pattern(url, buf);
 766    }
 767
 768    return JB_ERR_OK;
 769
 770 }
 771
 772
 773 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 774 /*********************************************************************
 775  *
 776  * Function    :  compile_host_pattern
 777  *
 778  * Description :  Parses and compiles a host pattern.
 779  *
 780  * Parameters  :
 781  *          1  :  url = Target url_spec to be filled in.
 782  *          2  :  host_pattern = Host pattern to compile.
 783  *
 784  * Returns     :  JB_ERR_OK - Success
 785  *                JB_ERR_MEMORY - Out of memory
 786  *                JB_ERR_PARSE - Cannot parse regex
 787  *
 788  *********************************************************************/
 789 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 790 {
 791    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->host_regex);
 792 }
 793
 794 #else
 795
 796 /*********************************************************************
 797  *
 798  * Function    :  compile_host_pattern
 799  *
 800  * Description :  Parses and "compiles" an old-school host pattern.
 801  *
 802  * Parameters  :
 803  *          1  :  url = Target url_spec to be filled in.
 804  *          2  :  host_pattern = Host pattern to parse.
 805  *
 806  * Returns     :  JB_ERR_OK - Success
 807  *                JB_ERR_MEMORY - Out of memory
 808  *                JB_ERR_PARSE - Cannot parse regex
 809  *
 810  *********************************************************************/
 811 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 812 {
 813    char *v[150];
 814    size_t size;
 815    char *p;
 816
 817    /*
 818     * Parse domain part
 819     */
 820    if (host_pattern[strlen(host_pattern) - 1] == '.')
 821    {
 822       url->unanchored |= ANCHOR_RIGHT;
 823    }
 824    if (host_pattern[0] == '.')
 825    {
 826       url->unanchored |= ANCHOR_LEFT;
 827    }
 828
 829    /*
 830     * Split domain into components
 831     */
 832    url->dbuffer = strdup(host_pattern);
 833    if (NULL == url->dbuffer)
 834    {
 835       free_url_spec(url);
 836       return JB_ERR_MEMORY;
 837    }
 838
 839    /*
 840     * Map to lower case
 841     */
 842    for (p = url->dbuffer; *p ; p++)
 843    {
 844       *p = (char)privoxy_tolower(*p);
 845    }
 846
 847    /*
 848     * Split the domain name into components
 849     */
 850    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 851
 852    if (url->dcount < 0)
 853    {
 854       free_url_spec(url);
 855       return JB_ERR_MEMORY;
 856    }
 857    else if (url->dcount != 0)
 858    {
 859       /*
 860        * Save a copy of the pointers in dvec
 861        */
 862       size = (size_t)url->dcount * sizeof(*url->dvec);
 863
 864       url->dvec = (char **)malloc(size);
 865       if (NULL == url->dvec)
 866       {
 867          free_url_spec(url);
 868          return JB_ERR_MEMORY;
 869       }
 870
 871       memcpy(url->dvec, v, size);
 872    }
 873    /*
 874     * else dcount == 0 in which case we needn't do anything,
 875     * since dvec will never be accessed and the pattern will
 876     * match all domains.
 877     */
 878    return JB_ERR_OK;
 879 }
 880
 881
 /*********************************************************************
  *
  * Function    :  simplematch
  *
  * Description :  String matching, with a '*' wildcard that stands
  *                for zero or more arbitrary characters, a '?' that
  *                stands for a single arbitrary character and
  *                character classes in [], which take both
  *                enumerations and ranges.
  *
  *                A class without closing bracket, or with a range
  *                that lacks its upper end, makes the match fail.
  *                A ']' without a preceding class only matches a
  *                literal ']' in the text.
  *
  * Parameters  :
  *          1  :  pattern = pattern for matching
  *          2  :  text    = text to be matched
  *
  * Returns     :  0 if match, else nonzero
  *
  *********************************************************************/
 static int simplematch(const char *pattern, const char *text)
 {
    const unsigned char *pat = (const unsigned char *)pattern;
    const unsigned char *txt = (const unsigned char *)text;
    const unsigned char *fallback = pat;
    int wildcard = 0;

    unsigned char lastchar = 'a';
    unsigned i;
    unsigned char charmap[32]; /* One bit for each of the 256 characters */

    /*
     * Start out with an empty character class. The map is consulted
     * whenever the pattern points to a ']', which can happen without
     * a class having been parsed before.
     */
    memset(charmap, '\0', sizeof(charmap));

    while (*txt)
    {

       /* EOF pattern but !EOF text? */
       if (*pat == '\0')
       {
          if (wildcard)
          {
             pat = fallback;
          }
          else
          {
             return 1;
          }
       }

       /* '*' in the pattern?  */
       if (*pat == '*')
       {

          /* The pattern ends afterwards? Speed up the return. */
          if (*++pat == '\0')
          {
             return 0;
          }

          /* Else, set wildcard mode and remember position after '*' */
          wildcard = 1;
          fallback = pat;
       }

       /* Character range specification? */
       if (*pat == '[')
       {
          memset(charmap, '\0', sizeof(charmap));

          /* Collect the class members, pat ends up on the ']'. */
          while (*++pat != ']')
          {
             if (!*pat)
             {
                /* Pattern ended inside the class */
                return 1;
             }
             else if (*pat == '-')
             {
                if ((*++pat == ']') || *pat == '\0')
                {
                   /* Range without upper end */
                   return(1);
                }
                for (i = lastchar; i <= *pat; i++)
                {
                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
                }
             }
             else
             {
                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
                lastchar = *pat;
             }
          }
       } /* -END- if Character range specification */


       /*
        * Char match, or char range match?
        */
       if ( (*pat == *txt)
       ||   (*pat == '?')
       ||   ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))) )
       {
          /*
           * Success: Go ahead
           */
          pat++;
       }
       else if (!wildcard)
       {
          /*
           * No match && no wildcard: No luck
           */
          return 1;
       }
       else if (pat != fallback)
       {
          /*
           * Increment text pointer if in char range matching
           */
          if (*pat == ']')
          {
             txt++;
          }
          /*
           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
           */
          pat = fallback;
          /*
           * Restart matching from current text pointer
           */
          continue;
       }
       txt++;
    }

    /* Cut off extra '*'s */
    if(*pat == '*')  pat++;

    /* If this is the pattern's end, fine! */
    return(*pat);

 }
1018
1019
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Domain-wise Compare fqdn's.  The comparison is
 *                both left- and right-anchored: the components at
 *                the same position are compared with simplematch()
 *                and the first pair that doesn't match ends the
 *                comparison.
 *                This is only used by domain_match.
 *
 * Parameters  :
 *          1  :  pv = array of patterns to compare
 *          2  :  fv = array of domain components to compare
 *          3  :  len = length of the arrays (both arrays are the
 *                      same length - if they weren't, it couldn't
 *                      possibly be a match).
 *
 * Returns     :  0 => domains are equivalent, else no match.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   int idx = 0;

   while (idx < len)
   {
      if (simplematch(pv[idx], fv[idx]) != 0)
      {
         /* One differing component is enough. */
         return 1;
      }
      idx++;
   }

   return 0;
}
1054
1055
1056 /*********************************************************************
1057  *
1058  * Function    :  domain_match
1059  *
1060  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1061  *                pattern->unachored, the comparison is un-, left-,
1062  *                right-anchored, or both.
1063  *                The individual domain names are compared with
1064  *                simplematch().
1065  *
1066  * Parameters  :
1067  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
1068  *          2  :  fqdn = domain name against which the patterns are compared.
1069  *
1070  * Returns     :  0 => domains are equivalent, else no match.
1071  *
1072  *********************************************************************/
1073 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
1074 {
1075    char **pv, **fv;  /* vectors  */
1076    int    plen, flen;
1077    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1078
1079    plen = pattern->dcount;
1080    flen = fqdn->dcount;
1081
1082    if (flen < plen)
1083    {
1084       /* fqdn is too short to match this pattern */
1085       return 1;
1086    }
1087
1088    pv   = pattern->dvec;
1089    fv   = fqdn->dvec;
1090
1091    if (unanchored == ANCHOR_LEFT)
1092    {
1093       /*
1094        * Right anchored.
1095        *
1096        * Convert this into a fully anchored pattern with
1097        * the fqdn and pattern the same length
1098        */
1099       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1100       return simple_domaincmp(pv, fv, plen);
1101    }
1102    else if (unanchored == 0)
1103    {
1104       /* Fully anchored, check length */
1105       if (flen != plen)
1106       {
1107          return 1;
1108       }
1109       return simple_domaincmp(pv, fv, plen);
1110    }
1111    else if (unanchored == ANCHOR_RIGHT)
1112    {
1113       /* Left anchored, ignore all extra in fqdn */
1114       return simple_domaincmp(pv, fv, plen);
1115    }
1116    else
1117    {
1118       /* Unanchored */
1119       int n;
1120       int maxn = flen - plen;
1121       for (n = 0; n <= maxn; n++)
1122       {
1123          if (!simple_domaincmp(pv, fv, plen))
1124          {
1125             return 0;
1126          }
1127          /*
1128           * Doesn't match from start of fqdn
1129           * Try skipping first part of fqdn
1130           */
1131          fv++;
1132       }
1133       return 1;
1134    }
1135
1136 }
1137 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1138
1139
1140 /*********************************************************************
1141  *
1142  * Function    :  create_url_spec
1143  *
1144  * Description :  Creates a "url_spec" structure from a string.
1145  *                When finished, free with free_url_spec().
1146  *
1147  * Parameters  :
1148  *          1  :  url = Target url_spec to be filled in.  Will be
1149  *                      zeroed before use.
1150  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1151  *                      contents of this buffer are destroyed by this
1152  *                      function.  If this function succeeds, the
1153  *                      buffer is copied to url->spec.  If this
1154  *                      function fails, the contents of the buffer
1155  *                      are lost forever.
1156  *
1157  * Returns     :  JB_ERR_OK - Success
1158  *                JB_ERR_MEMORY - Out of memory
1159  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1160  *                               written to system log)
1161  *
1162  *********************************************************************/
1163 jb_err create_url_spec(struct url_spec *url, char *buf)
1164 {
1165    assert(url);
1166    assert(buf);
1167
1168    memset(url, '\0', sizeof(*url));
1169
1170    /* Remember the original specification for the CGI pages. */
1171    url->spec = strdup(buf);
1172    if (NULL == url->spec)
1173    {
1174       return JB_ERR_MEMORY;
1175    }
1176
1177    /* Is it a tag pattern? */
1178    if (0 == strncmpic(url->spec, "TAG:", 4))
1179    {
1180       /* The pattern starts with the first character after "TAG:" */
1181       const char *tag_pattern = buf + 4;
1182       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
1183    }
1184
1185    /* If it isn't a tag pattern it must be an URL pattern. */
1186    return compile_url_pattern(url, buf);
1187 }
1188
1189
1190 /*********************************************************************
1191  *
1192  * Function    :  free_url_spec
1193  *
1194  * Description :  Called from the "unloaders".  Freez the url
1195  *                structure elements.
1196  *
1197  * Parameters  :
1198  *          1  :  url = pointer to a url_spec structure.
1199  *
1200  * Returns     :  N/A
1201  *
1202  *********************************************************************/
1203 void free_url_spec(struct url_spec *url)
1204 {
1205    if (url == NULL) return;
1206
1207    freez(url->spec);
1208 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1209    if (url->host_regex)
1210    {
1211       regfree(url->host_regex);
1212       freez(url->host_regex);
1213    }
1214 #else
1215    freez(url->dbuffer);
1216    freez(url->dvec);
1217    url->dcount = 0;
1218 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1219    freez(url->port_list);
1220    if (url->preg)
1221    {
1222       regfree(url->preg);
1223       freez(url->preg);
1224    }
1225    if (url->tag_regex)
1226    {
1227       regfree(url->tag_regex);
1228       freez(url->tag_regex);
1229    }
1230 }
1231
1232
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks a port against an optional port list.
 *                A missing (NULL) port list matches every port,
 *                otherwise match_portlist() decides.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL.
 *
 * Returns     :  TRUE for yes, FALSE otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (port_list == NULL)
   {
      /* No port restriction at all. */
      return 1;
   }

   return (0 != match_portlist(port_list, port));
}
1250
1251
1252 /*********************************************************************
1253  *
1254  * Function    :  host_matches
1255  *
1256  * Description :  Compares a host against a host pattern.
1257  *
1258  * Parameters  :
1259  *          1  :  url = The URL to match
1260  *          2  :  pattern = The URL pattern
1261  *
1262  * Returns     :  TRUE for yes, FALSE otherwise.
1263  *
1264  *********************************************************************/
1265 static int host_matches(const struct http_request *http,
1266                         const struct url_spec *pattern)
1267 {
1268 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1269    return ((NULL == pattern->host_regex)
1270       || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)));
1271 #else
1272    return ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)));
1273 #endif
1274 }
1275
1276
1277 /*********************************************************************
1278  *
1279  * Function    :  path_matches
1280  *
1281  * Description :  Compares a path against a path pattern.
1282  *
1283  * Parameters  :
1284  *          1  :  path = The path to match
1285  *          2  :  pattern = The URL pattern
1286  *
1287  * Returns     :  TRUE for yes, FALSE otherwise.
1288  *
1289  *********************************************************************/
1290 static int path_matches(const char *path, const struct url_spec *pattern)
1291 {
1292    return ((NULL == pattern->preg)
1293       || (0 == regexec(pattern->preg, path, 0, NULL, 0)));
1294 }
1295
1296
1297 /*********************************************************************
1298  *
1299  * Function    :  url_match
1300  *
1301  * Description :  Compare a URL against a URL pattern.
1302  *
1303  * Parameters  :
1304  *          1  :  pattern = a URL pattern
1305  *          2  :  url = URL to match
1306  *
1307  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1308  *
1309  *********************************************************************/
1310 int url_match(const struct url_spec *pattern,
1311               const struct http_request *http)
1312 {
1313    if (pattern->tag_regex != NULL)
1314    {
1315       /* It's a tag pattern and shouldn't be matched against URLs */
1316       return 0;
1317    }
1318
1319    return (port_matches(http->port, pattern->port_list)
1320       && host_matches(http, pattern) && path_matches(http->path, pattern));
1321
1322 }
1323
1324
/*********************************************************************
 *
 * Function    :  portlist_field_value
 *
 * Description :  Helper for match_portlist(). Converts the number at
 *                the start of one field of a port list, where the
 *                field is delimited by "end" instead of being
 *                null terminated.
 *
 * Parameters  :
 *          1  :  begin = first character of the field
 *          2  :  end = delimiter behind the field. Must point at
 *                      a '-', a ',' or the terminating null byte
 *                      of the string "begin" points into.
 *
 * Returns     :  The value of the number the field starts with,
 *                or 0 if it doesn't start with a number.
 *
 *********************************************************************/
static int portlist_field_value(const char *begin, const char *end)
{
   char *stop;
   const long value = strtol(begin, &stop, 10);

   /*
    * If the conversion went beyond the delimiter, the field itself
    * contained no number (think of the empty lower bound in "-80")
    * and strtol() took the '-' delimiter for a sign and converted
    * the following field instead.
    */
   return (stop > end) ? 0 : (int)value;
}


/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is parsed in place; it is neither
 *                modified nor copied. Within an item the first '-'
 *                separates the lower from the upper bound. A bound
 *                that doesn't start with a number counts as 0 and
 *                an upper bound of 0 is replaced with 65535, so
 *                "8000-" covers 8000 to 65535 and "-80" covers
 *                0 to 80.
 *
 * Parameters  :
 *          1  :  portlist = String with list. NULL is treated
 *                           like a list that matches nothing.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   while (NULL != item)
   {
      const char *comma = strchr(item, (int) ',');
      const char *item_end = (NULL != comma) ? comma : item + strlen(item);
      const char *dash = memchr(item, (int) '-', (size_t)(item_end - item));

      if (NULL == dash)
      {
         /*
          * No dash, check for equality
          */
         if (port == portlist_field_value(item, item_end))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         const int range_min = portlist_field_value(item, dash);
         int range_max = portlist_field_value(dash + 1, item_end);

         if (0 == range_max)
         {
            range_max = 65535;
         }
         if ((port >= range_min) && (port <= range_max))
         {
            return 1;
         }
      }

      /*
       * Jump to next item, if there is one
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1403
1404
1405 /*********************************************************************
1406  *
1407  * Function    :  parse_forwarder_address
1408  *
1409  * Description :  Parse out the host and port from a forwarder address.
1410  *
1411  * Parameters  :
1412  *          1  :  address = The forwarder address to parse.
1413  *          2  :  hostname = Used to return the hostname. NULL on error.
1414  *          3  :  port = Used to return the port. Untouched if no port
1415  *                       is specified.
1416  *
1417  * Returns     :  JB_ERR_OK on success
1418  *                JB_ERR_MEMORY on out of memory
1419  *                JB_ERR_PARSE on malformed address.
1420  *
1421  *********************************************************************/
1422 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1423 {
1424    char *p = address;
1425
1426    if ((*address == '[') && (NULL == strchr(address, ']')))
1427    {
1428       /* XXX: Should do some more validity checks here. */
1429       return JB_ERR_PARSE;
1430    }
1431
1432    *hostname = strdup(address);
1433    if (NULL == *hostname)
1434    {
1435       return JB_ERR_MEMORY;
1436    }
1437
1438    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1439    {
1440       *p++ = '\0';
1441       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1442       if (*p == ':')
1443       {
1444          *port = (int)strtol(++p, NULL, 0);
1445       }
1446    }
1447    else if (NULL != (p = strchr(*hostname, ':')))
1448    {
1449       *p++ = '\0';
1450       *port = (int)strtol(p, NULL, 0);
1451    }
1452
1453    return JB_ERR_OK;
1454
1455 }
1456
1457
1458 /*
1459   Local Variables:
1460   tab-width: 3
1461   end:
1462 */