urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.72 2012/07/23 12:42:53 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2011
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
  59 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  60
  61 enum regex_anchoring
  62 {
  63    NO_ANCHORING,
  64    LEFT_ANCHORED,
  65    RIGHT_ANCHORED,
  66    RIGHT_ANCHORED_HOST
  67 };
  68 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern);
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_PARSE on malformed command/URL
 119  *                             or >100 domains deep.
 120  *
 121  *********************************************************************/
 122 jb_err init_domain_components(struct http_request *http)
 123 {
 124    char *vec[BUFFER_SIZE];
 125    size_t size;
 126    char *p;
 127
 128    http->dbuffer = strdup_or_die(http->host);
 129
 130    /* map to lower case */
 131    for (p = http->dbuffer; *p ; p++)
 132    {
 133       *p = (char)privoxy_tolower(*p);
 134    }
 135
 136    /* split the domain name into components */
 137    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 138
 139    if (http->dcount <= 0)
 140    {
 141       /*
 142        * Error: More than SZ(vec) components in domain
 143        *    or: no components in domain
 144        */
 145       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 146       return JB_ERR_PARSE;
 147    }
 148
 149    /* save a copy of the pointers in dvec */
 150    size = (size_t)http->dcount * sizeof(*http->dvec);
 151
 152    http->dvec = malloc_or_die(size);
 153
 154    memcpy(http->dvec, vec, size);
 155
 156    return JB_ERR_OK;
 157 }
 158 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 159
 160
 161 /*********************************************************************
 162  *
 163  * Function    :  url_requires_percent_encoding
 164  *
 165  * Description :  Checks if an URL contains invalid characters
 166  *                according to RFC 3986 that should be percent-encoded.
 167  *                Does not verify whether or not the passed string
 168  *                actually is a valid URL.
 169  *
 170  * Parameters  :
 171  *          1  :  url = URL to check
 172  *
 173  * Returns     :  True in case of valid URLs, false otherwise
 174  *
 175  *********************************************************************/
 176 int url_requires_percent_encoding(const char *url)
 177 {
 178    static const char allowed_characters[128] = {
 179       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 180       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 181       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 182       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 183       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 184       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 185       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 186       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 187       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 188       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 189       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 190       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 191       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 192    };
 193
 194    while (*url != '\0')
 195    {
 196       const unsigned int i = (unsigned char)*url++;
 197       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 198       {
 199          return TRUE;
 200       }
 201    }
 202
 203    return FALSE;
 204
 205 }
 206
 207
 208 /*********************************************************************
 209  *
 210  * Function    :  parse_http_url
 211  *
 212  * Description :  Parse out the host and port from the URL.  Find the
 213  *                hostname & path, port (if ':'), and/or password (if '@')
 214  *
 215  * Parameters  :
 216  *          1  :  url = URL (or is it URI?) to break down
 217  *          2  :  http = pointer to the http structure to hold elements.
 218  *                       Must be initialized with valid values (like NULLs).
 219  *          3  :  require_protocol = Whether or not URLs without
 220  *                                   protocol are acceptable.
 221  *
 222  * Returns     :  JB_ERR_OK on success
 223  *                JB_ERR_PARSE on malformed command/URL
 224  *                             or >100 domains deep.
 225  *
 226  *********************************************************************/
 227 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 228 {
 229    int host_available = 1; /* A proxy can dream. */
 230
 231    /*
 232     * Save our initial URL
 233     */
 234    http->url = strdup_or_die(url);
 235
 236    /*
 237     * Check for * URI. If found, we're done.
 238     */
 239    if (*http->url == '*')
 240    {
 241       http->path = strdup_or_die("*");
 242       http->hostport = strdup_or_die("");
 243       if (http->url[1] != '\0')
 244       {
 245          return JB_ERR_PARSE;
 246       }
 247       return JB_ERR_OK;
 248    }
 249
 250
 251    /*
 252     * Split URL into protocol,hostport,path.
 253     */
 254    {
 255       char *buf;
 256       char *url_noproto;
 257       char *url_path;
 258
 259       buf = strdup_or_die(url);
 260
 261       /* Find the start of the URL in our scratch space */
 262       url_noproto = buf;
 263       if (strncmpic(url_noproto, "http://",  7) == 0)
 264       {
 265          url_noproto += 7;
 266       }
 267       else if (strncmpic(url_noproto, "https://", 8) == 0)
 268       {
 269          /*
 270           * Should only happen when called from cgi_show_url_info().
 271           */
 272          url_noproto += 8;
 273          http->ssl = 1;
 274       }
 275       else if (*url_noproto == '/')
 276       {
 277         /*
 278          * Short request line without protocol and host.
 279          * Most likely because the client's request
 280          * was intercepted and redirected into Privoxy.
 281          */
 282          http->host = NULL;
 283          host_available = 0;
 284       }
 285       else if (require_protocol)
 286       {
 287          freez(buf);
 288          return JB_ERR_PARSE;
 289       }
 290
 291       url_path = strchr(url_noproto, '/');
 292       if (url_path != NULL)
 293       {
 294          /*
 295           * Got a path.
 296           *
 297           * NOTE: The following line ignores the path for HTTPS URLS.
 298           * This means that you get consistent behaviour if you type a
 299           * https URL in and it's parsed by the function.  (When the
 300           * URL is actually retrieved, SSL hides the path part).
 301           */
 302          http->path = strdup_or_die(http->ssl ? "/" : url_path);
 303          *url_path = '\0';
 304          http->hostport = strdup_or_die(url_noproto);
 305       }
 306       else
 307       {
 308          /*
 309           * Repair broken HTTP requests that don't contain a path,
 310           * or CONNECT requests
 311           */
 312          http->path = strdup_or_die("/");
 313          http->hostport = strdup_or_die(url_noproto);
 314       }
 315
 316       freez(buf);
 317    }
 318
 319    if (!host_available)
 320    {
 321       /* Without host, there is nothing left to do here */
 322       return JB_ERR_OK;
 323    }
 324
 325    /*
 326     * Split hostport into user/password (ignored), host, port.
 327     */
 328    {
 329       char *buf;
 330       char *host;
 331       char *port;
 332
 333       buf = strdup_or_die(http->hostport);
 334
 335       /* check if url contains username and/or password */
 336       host = strchr(buf, '@');
 337       if (host != NULL)
 338       {
 339          /* Contains username/password, skip it and the @ sign. */
 340          host++;
 341       }
 342       else
 343       {
 344          /* No username or password. */
 345          host = buf;
 346       }
 347
 348       /* Move after hostname before port number */
 349       if (*host == '[')
 350       {
 351          /* Numeric IPv6 address delimited by brackets */
 352          host++;
 353          port = strchr(host, ']');
 354
 355          if (port == NULL)
 356          {
 357             /* Missing closing bracket */
 358             freez(buf);
 359             return JB_ERR_PARSE;
 360          }
 361
 362          *port++ = '\0';
 363
 364          if (*port == '\0')
 365          {
 366             port = NULL;
 367          }
 368          else if (*port != ':')
 369          {
 370             /* Garbage after closing bracket */
 371             freez(buf);
 372             return JB_ERR_PARSE;
 373          }
 374       }
 375       else
 376       {
 377          /* Plain non-escaped hostname */
 378          port = strchr(host, ':');
 379       }
 380
 381       /* check if url contains port */
 382       if (port != NULL)
 383       {
 384          /* Contains port */
 385          /* Terminate hostname and point to start of port string */
 386          *port++ = '\0';
 387          http->port = atoi(port);
 388       }
 389       else
 390       {
 391          /* No port specified. */
 392          http->port = (http->ssl ? 443 : 80);
 393       }
 394
 395       http->host = strdup_or_die(host);
 396
 397       freez(buf);
 398    }
 399
 400 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 401    return JB_ERR_OK;
 402 #else
 403    /* Split domain name so we can compare it against wildcards */
 404    return init_domain_components(http);
 405 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 406
 407 }
 408
 409
 410 /*********************************************************************
 411  *
 412  * Function    :  unknown_method
 413  *
 414  * Description :  Checks whether a method is unknown.
 415  *
 416  * Parameters  :
 417  *          1  :  method = points to a http method
 418  *
 419  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 420  *
 421  *********************************************************************/
 422 static int unknown_method(const char *method)
 423 {
 424    static const char * const known_http_methods[] = {
 425       /* Basic HTTP request type */
 426       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 427       /* webDAV extensions (RFC2518) */
 428       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 429       /*
 430        * Microsoft webDAV extension for Exchange 2000.  See:
 431        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 432        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 433        */
 434       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 435       /*
 436        * Another Microsoft webDAV extension for Exchange 2000.  See:
 437        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 438        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 439        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 440        */
 441       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 442       /*
 443        * Yet another WebDAV extension, this time for
 444        * Web Distributed Authoring and Versioning (RFC3253)
 445        */
 446       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 447       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 448    };
 449    int i;
 450
 451    for (i = 0; i < SZ(known_http_methods); i++)
 452    {
 453       if (0 == strcmpic(method, known_http_methods[i]))
 454       {
 455          return FALSE;
 456       }
 457    }
 458
 459    return TRUE;
 460
 461 }
 462
 463
 464 /*********************************************************************
 465  *
 466  * Function    :  parse_http_request
 467  *
 468  * Description :  Parse out the host and port from the URL.  Find the
 469  *                hostname & path, port (if ':'), and/or password (if '@')
 470  *
 471  * Parameters  :
 472  *          1  :  req = HTTP request line to break down
 473  *          2  :  http = pointer to the http structure to hold elements
 474  *
 475  * Returns     :  JB_ERR_OK on success
 476  *                JB_ERR_CGI_PARAMS on malformed command/URL
 477  *                                  or >100 domains deep.
 478  *
 479  *********************************************************************/
 480 jb_err parse_http_request(const char *req, struct http_request *http)
 481 {
 482    char *buf;
 483    char *v[10]; /* XXX: Why 10? We should only need three. */
 484    int n;
 485    jb_err err;
 486
 487    memset(http, '\0', sizeof(*http));
 488
 489    buf = strdup_or_die(req);
 490
 491    n = ssplit(buf, " \r\n", v, SZ(v));
 492    if (n != 3)
 493    {
 494       freez(buf);
 495       return JB_ERR_PARSE;
 496    }
 497
 498    /*
 499     * Fail in case of unknown methods
 500     * which we might not handle correctly.
 501     *
 502     * XXX: There should be a config option
 503     * to forward requests with unknown methods
 504     * anyway. Most of them don't need special
 505     * steps.
 506     */
 507    if (unknown_method(v[0]))
 508    {
 509       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 510       freez(buf);
 511       return JB_ERR_PARSE;
 512    }
 513
 514    if (strcmpic(v[2], "HTTP/1.1") && strcmpic(v[2], "HTTP/1.0"))
 515    {
 516       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 517          "versions are 1.0 and 1.1. This rules out: %s", v[2]);
 518       freez(buf);
 519       return JB_ERR_PARSE;
 520    }
 521
 522    http->ssl = !strcmpic(v[0], "CONNECT");
 523
 524    err = parse_http_url(v[1], http, !http->ssl);
 525    if (err)
 526    {
 527       freez(buf);
 528       return err;
 529    }
 530
 531    /*
 532     * Copy the details into the structure
 533     */
 534    http->cmd = strdup_or_die(req);
 535    http->gpc = strdup_or_die(v[0]);
 536    http->ver = strdup_or_die(v[2]);
 537
 538    freez(buf);
 539
 540    return JB_ERR_OK;
 541
 542 }
 543
 544
 545 /*********************************************************************
 546  *
 547  * Function    :  compile_pattern
 548  *
 549  * Description :  Compiles a host, domain or TAG pattern.
 550  *
 551  * Parameters  :
 552  *          1  :  pattern = The pattern to compile.
 553  *          2  :  anchoring = How the regex should be modified
 554  *                            before compilation. Can be either
 555  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 556  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 557  *          3  :  url     = In case of failures, the spec member is
 558  *                          logged and the structure freed.
 559  *          4  :  regex   = Where the compiled regex should be stored.
 560  *
 561  * Returns     :  JB_ERR_OK - Success
 562  *                JB_ERR_MEMORY - Out of memory
 563  *                JB_ERR_PARSE - Cannot parse regex
 564  *
 565  *********************************************************************/
 566 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 567                               struct url_spec *url, regex_t **regex)
 568 {
 569    int errcode;
 570    char rebuf[BUFFER_SIZE];
 571    const char *fmt = NULL;
 572
 573    assert(pattern);
 574    assert(strlen(pattern) < sizeof(rebuf) - 2);
 575
 576    if (pattern[0] == '\0')
 577    {
 578       *regex = NULL;
 579       return JB_ERR_OK;
 580    }
 581
 582    switch (anchoring)
 583    {
 584       case NO_ANCHORING:
 585          fmt = "%s";
 586          break;
 587       case RIGHT_ANCHORED:
 588          fmt = "%s$";
 589          break;
 590       case RIGHT_ANCHORED_HOST:
 591          fmt = "%s\\.?$";
 592          break;
 593       case LEFT_ANCHORED:
 594          fmt = "^%s";
 595          break;
 596       default:
 597          log_error(LOG_LEVEL_FATAL,
 598             "Invalid anchoring in compile_pattern %d", anchoring);
 599    }
 600
 601    *regex = zalloc(sizeof(**regex));
 602    if (NULL == *regex)
 603    {
 604       free_url_spec(url);
 605       return JB_ERR_MEMORY;
 606    }
 607
 608    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 609
 610    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 611
 612    if (errcode)
 613    {
 614       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 615       if (errlen > (sizeof(rebuf) - (size_t)1))
 616       {
 617          errlen = sizeof(rebuf) - (size_t)1;
 618       }
 619       rebuf[errlen] = '\0';
 620       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 621          pattern, url->spec, rebuf);
 622       free_url_spec(url);
 623
 624       return JB_ERR_PARSE;
 625    }
 626
 627    return JB_ERR_OK;
 628
 629 }
 630
 631
 632 /*********************************************************************
 633  *
 634  * Function    :  compile_url_pattern
 635  *
 636  * Description :  Compiles the three parts of an URL pattern.
 637  *
 638  * Parameters  :
 639  *          1  :  url = Target url_spec to be filled in.
 640  *          2  :  buf = The url pattern to compile. Will be messed up.
 641  *
 642  * Returns     :  JB_ERR_OK - Success
 643  *                JB_ERR_MEMORY - Out of memory
 644  *                JB_ERR_PARSE - Cannot parse regex
 645  *
 646  *********************************************************************/
 647 static jb_err compile_url_pattern(struct url_spec *url, char *buf)
 648 {
 649    char *p;
 650
 651    p = strchr(buf, '/');
 652    if (NULL != p)
 653    {
 654       /*
 655        * Only compile the regex if it consists of more than
 656        * a single slash, otherwise it wouldn't affect the result.
 657        */
 658       if (p[1] != '\0')
 659       {
 660          /*
 661           * XXX: does it make sense to compile the slash at the beginning?
 662           */
 663          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->preg);
 664
 665          if (JB_ERR_OK != err)
 666          {
 667             return err;
 668          }
 669       }
 670       *p = '\0';
 671    }
 672
 673    /*
 674     * IPv6 numeric hostnames can contain colons, thus we need
 675     * to delimit the hostname before the real port separator.
 676     * As brackets are already used in the hostname pattern,
 677     * we use angle brackets ('<', '>') instead.
 678     */
 679    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 680    {
 681       *p++ = '\0';
 682       buf++;
 683
 684       if (*p == '\0')
 685       {
 686          /* IPv6 address without port number */
 687          p = NULL;
 688       }
 689       else if (*p != ':')
 690       {
 691          /* Garbage after address delimiter */
 692          return JB_ERR_PARSE;
 693       }
 694    }
 695    else
 696    {
 697       p = strchr(buf, ':');
 698    }
 699
 700    if (NULL != p)
 701    {
 702       *p++ = '\0';
 703       url->port_list = strdup_or_die(p);
 704    }
 705    else
 706    {
 707       url->port_list = NULL;
 708    }
 709
 710    if (buf[0] != '\0')
 711    {
 712       return compile_host_pattern(url, buf);
 713    }
 714
 715    return JB_ERR_OK;
 716
 717 }
 718
 719
 720 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 721 /*********************************************************************
 722  *
 723  * Function    :  compile_host_pattern
 724  *
 725  * Description :  Parses and compiles a host pattern.
 726  *
 727  * Parameters  :
 728  *          1  :  url = Target url_spec to be filled in.
 729  *          2  :  host_pattern = Host pattern to compile.
 730  *
 731  * Returns     :  JB_ERR_OK - Success
 732  *                JB_ERR_MEMORY - Out of memory
 733  *                JB_ERR_PARSE - Cannot parse regex
 734  *
 735  *********************************************************************/
 736 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 737 {
 738    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->host_regex);
 739 }
 740
 741 #else
 742
 743 /*********************************************************************
 744  *
 745  * Function    :  compile_host_pattern
 746  *
 747  * Description :  Parses and "compiles" an old-school host pattern.
 748  *
 749  * Parameters  :
 750  *          1  :  url = Target url_spec to be filled in.
 751  *          2  :  host_pattern = Host pattern to parse.
 752  *
 753  * Returns     :  JB_ERR_OK - Success
 754  *                JB_ERR_PARSE - Cannot parse regex
 755  *
 756  *********************************************************************/
 757 static jb_err compile_host_pattern(struct url_spec *url, const char *host_pattern)
 758 {
 759    char *v[150];
 760    size_t size;
 761    char *p;
 762
 763    /*
 764     * Parse domain part
 765     */
 766    if (host_pattern[strlen(host_pattern) - 1] == '.')
 767    {
 768       url->unanchored |= ANCHOR_RIGHT;
 769    }
 770    if (host_pattern[0] == '.')
 771    {
 772       url->unanchored |= ANCHOR_LEFT;
 773    }
 774
 775    /*
 776     * Split domain into components
 777     */
 778    url->dbuffer = strdup_or_die(host_pattern);
 779
 780    /*
 781     * Map to lower case
 782     */
 783    for (p = url->dbuffer; *p ; p++)
 784    {
 785       *p = (char)privoxy_tolower(*p);
 786    }
 787
 788    /*
 789     * Split the domain name into components
 790     */
 791    url->dcount = ssplit(url->dbuffer, ".", v, SZ(v));
 792
 793    if (url->dcount < 0)
 794    {
 795       free_url_spec(url);
 796       return JB_ERR_MEMORY;
 797    }
 798    else if (url->dcount != 0)
 799    {
 800       /*
 801        * Save a copy of the pointers in dvec
 802        */
 803       size = (size_t)url->dcount * sizeof(*url->dvec);
 804
 805       url->dvec = malloc_or_die(size);
 806
 807       memcpy(url->dvec, v, size);
 808    }
 809    /*
 810     * else dcount == 0 in which case we needn't do anything,
 811     * since dvec will never be accessed and the pattern will
 812     * match all domains.
 813     */
 814    return JB_ERR_OK;
 815 }
 816
 817
 818 /*********************************************************************
 819  *
 820  * Function    :  simplematch
 821  *
 822  * Description :  String matching, with a (greedy) '*' wildcard that
 823  *                stands for zero or more arbitrary characters and
 824  *                character classes in [], which take both enumerations
 825  *                and ranges.
 826  *
 827  * Parameters  :
 828  *          1  :  pattern = pattern for matching
 829  *          2  :  text    = text to be matched
 830  *
 831  * Returns     :  0 if match, else nonzero
 832  *
 833  *********************************************************************/
 834 static int simplematch(const char *pattern, const char *text)
 835 {
 836    const unsigned char *pat = (const unsigned char *)pattern;
 837    const unsigned char *txt = (const unsigned char *)text;
 838    const unsigned char *fallback = pat;
 839    int wildcard = 0;
 840
 841    unsigned char lastchar = 'a';
 842    unsigned i;
 843    unsigned char charmap[32];
 844
 845    while (*txt)
 846    {
 847
 848       /* EOF pattern but !EOF text? */
 849       if (*pat == '\0')
 850       {
 851          if (wildcard)
 852          {
 853             pat = fallback;
 854          }
 855          else
 856          {
 857             return 1;
 858          }
 859       }
 860
 861       /* '*' in the pattern?  */
 862       if (*pat == '*')
 863       {
 864
 865          /* The pattern ends afterwards? Speed up the return. */
 866          if (*++pat == '\0')
 867          {
 868             return 0;
 869          }
 870
 871          /* Else, set wildcard mode and remember position after '*' */
 872          wildcard = 1;
 873          fallback = pat;
 874       }
 875
 876       /* Character range specification? */
 877       if (*pat == '[')
 878       {
 879          memset(charmap, '\0', sizeof(charmap));
 880
 881          while (*++pat != ']')
 882          {
 883             if (!*pat)
 884             {
 885                return 1;
 886             }
 887             else if (*pat == '-')
 888             {
 889                if ((*++pat == ']') || *pat == '\0')
 890                {
 891                   return(1);
 892                }
 893                for (i = lastchar; i <= *pat; i++)
 894                {
 895                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 896                }
 897             }
 898             else
 899             {
 900                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 901                lastchar = *pat;
 902             }
 903          }
 904       } /* -END- if Character range specification */
 905
 906
 907       /*
 908        * Char match, or char range match?
 909        */
 910       if ((*pat == *txt)
 911        || (*pat == '?')
 912        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 913       {
 914          /*
 915           * Success: Go ahead
 916           */
 917          pat++;
 918       }
 919       else if (!wildcard)
 920       {
 921          /*
 922           * No match && no wildcard: No luck
 923           */
 924          return 1;
 925       }
 926       else if (pat != fallback)
 927       {
 928          /*
 929           * Increment text pointer if in char range matching
 930           */
 931          if (*pat == ']')
 932          {
 933             txt++;
 934          }
 935          /*
 936           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 937           */
 938          pat = fallback;
 939          /*
 940           * Restart matching from current text pointer
 941           */
 942          continue;
 943       }
 944       txt++;
 945    }
 946
 947    /* Cut off extra '*'s */
 948    if (*pat == '*') pat++;
 949
 950    /* If this is the pattern's end, fine! */
 951    return(*pat);
 952
 953 }
 954
 955
 956 /*********************************************************************
 957  *
 958  * Function    :  simple_domaincmp
 959  *
 960  * Description :  Domain-wise Compare fqdn's.  The comparison is
 961  *                both left- and right-anchored.  The individual
 962  *                domain names are compared with simplematch().
 963  *                This is only used by domain_match.
 964  *
 965  * Parameters  :
 966  *          1  :  pv = array of patterns to compare
 967  *          2  :  fv = array of domain components to compare
 968  *          3  :  len = length of the arrays (both arrays are the
 969  *                      same length - if they weren't, it couldn't
 970  *                      possibly be a match).
 971  *
 972  * Returns     :  0 => domains are equivalent, else no match.
 973  *
 974  *********************************************************************/
 975 static int simple_domaincmp(char **pv, char **fv, int len)
 976 {
 977    int n;
 978
 979    for (n = 0; n < len; n++)
 980    {
 981       if (simplematch(pv[n], fv[n]))
 982       {
 983          return 1;
 984       }
 985    }
 986
 987    return 0;
 988
 989 }
 990
 991
 992 /*********************************************************************
 993  *
 994  * Function    :  domain_match
 995  *
 996  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 997  *                pattern->unachored, the comparison is un-, left-,
 998  *                right-anchored, or both.
 999  *                The individual domain names are compared with
1000  *                simplematch().
1001  *
1002  * Parameters  :
1003  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
1004  *          2  :  fqdn = domain name against which the patterns are compared.
1005  *
1006  * Returns     :  0 => domains are equivalent, else no match.
1007  *
1008  *********************************************************************/
1009 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
1010 {
1011    char **pv, **fv;  /* vectors  */
1012    int    plen, flen;
1013    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1014
1015    plen = pattern->dcount;
1016    flen = fqdn->dcount;
1017
1018    if (flen < plen)
1019    {
1020       /* fqdn is too short to match this pattern */
1021       return 1;
1022    }
1023
1024    pv   = pattern->dvec;
1025    fv   = fqdn->dvec;
1026
1027    if (unanchored == ANCHOR_LEFT)
1028    {
1029       /*
1030        * Right anchored.
1031        *
1032        * Convert this into a fully anchored pattern with
1033        * the fqdn and pattern the same length
1034        */
1035       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1036       return simple_domaincmp(pv, fv, plen);
1037    }
1038    else if (unanchored == 0)
1039    {
1040       /* Fully anchored, check length */
1041       if (flen != plen)
1042       {
1043          return 1;
1044       }
1045       return simple_domaincmp(pv, fv, plen);
1046    }
1047    else if (unanchored == ANCHOR_RIGHT)
1048    {
1049       /* Left anchored, ignore all extra in fqdn */
1050       return simple_domaincmp(pv, fv, plen);
1051    }
1052    else
1053    {
1054       /* Unanchored */
1055       int n;
1056       int maxn = flen - plen;
1057       for (n = 0; n <= maxn; n++)
1058       {
1059          if (!simple_domaincmp(pv, fv, plen))
1060          {
1061             return 0;
1062          }
1063          /*
1064           * Doesn't match from start of fqdn
1065           * Try skipping first part of fqdn
1066           */
1067          fv++;
1068       }
1069       return 1;
1070    }
1071
1072 }
1073 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1074
1075
1076 /*********************************************************************
1077  *
1078  * Function    :  create_url_spec
1079  *
1080  * Description :  Creates a "url_spec" structure from a string.
1081  *                When finished, free with free_url_spec().
1082  *
1083  * Parameters  :
1084  *          1  :  url = Target url_spec to be filled in.  Will be
1085  *                      zeroed before use.
1086  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1087  *                      contents of this buffer are destroyed by this
1088  *                      function.  If this function succeeds, the
1089  *                      buffer is copied to url->spec.  If this
1090  *                      function fails, the contents of the buffer
1091  *                      are lost forever.
1092  *
1093  * Returns     :  JB_ERR_OK - Success
1094  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1095  *                               written to system log)
1096  *
1097  *********************************************************************/
1098 jb_err create_url_spec(struct url_spec *url, char *buf)
1099 {
1100    assert(url);
1101    assert(buf);
1102
1103    memset(url, '\0', sizeof(*url));
1104
1105    /* Remember the original specification for the CGI pages. */
1106    url->spec = strdup_or_die(buf);
1107
1108    /* Is it a tag pattern? */
1109    if (0 == strncmpic(url->spec, "TAG:", 4))
1110    {
1111       /* The pattern starts with the first character after "TAG:" */
1112       const char *tag_pattern = buf + 4;
1113       return compile_pattern(tag_pattern, NO_ANCHORING, url, &url->tag_regex);
1114    }
1115
1116    /* If it isn't a tag pattern it must be an URL pattern. */
1117    return compile_url_pattern(url, buf);
1118 }
1119
1120
1121 /*********************************************************************
1122  *
1123  * Function    :  free_url_spec
1124  *
1125  * Description :  Called from the "unloaders".  Freez the url
1126  *                structure elements.
1127  *
1128  * Parameters  :
1129  *          1  :  url = pointer to a url_spec structure.
1130  *
1131  * Returns     :  N/A
1132  *
1133  *********************************************************************/
1134 void free_url_spec(struct url_spec *url)
1135 {
1136    if (url == NULL) return;
1137
1138    freez(url->spec);
1139 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1140    if (url->host_regex)
1141    {
1142       regfree(url->host_regex);
1143       freez(url->host_regex);
1144    }
1145 #else
1146    freez(url->dbuffer);
1147    freez(url->dvec);
1148    url->dcount = 0;
1149 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1150    freez(url->port_list);
1151    if (url->preg)
1152    {
1153       regfree(url->preg);
1154       freez(url->preg);
1155    }
1156    if (url->tag_regex)
1157    {
1158       regfree(url->tag_regex);
1159       freez(url->tag_regex);
1160    }
1161 }
1162
1163
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks whether a port is covered by a port list.
 *                A missing (NULL) list accepts every port.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL for "no restriction".
 *
 * Returns     :  TRUE for yes, FALSE otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (port_list == NULL)
   {
      /* Without a list there is nothing the port could violate. */
      return 1;
   }

   return (0 != match_portlist(port_list, port));
}
1181
1182
1183 /*********************************************************************
1184  *
1185  * Function    :  host_matches
1186  *
1187  * Description :  Compares a host against a host pattern.
1188  *
1189  * Parameters  :
1190  *          1  :  url = The URL to match
1191  *          2  :  pattern = The URL pattern
1192  *
1193  * Returns     :  TRUE for yes, FALSE otherwise.
1194  *
1195  *********************************************************************/
1196 static int host_matches(const struct http_request *http,
1197                         const struct url_spec *pattern)
1198 {
1199 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1200    return ((NULL == pattern->host_regex)
1201       || (0 == regexec(pattern->host_regex, http->host, 0, NULL, 0)));
1202 #else
1203    return ((NULL == pattern->dbuffer) || (0 == domain_match(pattern, http)));
1204 #endif
1205 }
1206
1207
1208 /*********************************************************************
1209  *
1210  * Function    :  path_matches
1211  *
1212  * Description :  Compares a path against a path pattern.
1213  *
1214  * Parameters  :
1215  *          1  :  path = The path to match
1216  *          2  :  pattern = The URL pattern
1217  *
1218  * Returns     :  TRUE for yes, FALSE otherwise.
1219  *
1220  *********************************************************************/
1221 static int path_matches(const char *path, const struct url_spec *pattern)
1222 {
1223    return ((NULL == pattern->preg)
1224       || (0 == regexec(pattern->preg, path, 0, NULL, 0)));
1225 }
1226
1227
1228 /*********************************************************************
1229  *
1230  * Function    :  url_match
1231  *
1232  * Description :  Compare a URL against a URL pattern.
1233  *
1234  * Parameters  :
1235  *          1  :  pattern = a URL pattern
1236  *          2  :  url = URL to match
1237  *
1238  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1239  *
1240  *********************************************************************/
1241 int url_match(const struct url_spec *pattern,
1242               const struct http_request *http)
1243 {
1244    if (pattern->tag_regex != NULL)
1245    {
1246       /* It's a tag pattern and shouldn't be matched against URLs */
1247       return 0;
1248    }
1249
1250    return (port_matches(http->port, pattern->port_list)
1251       && host_matches(http, pattern) && path_matches(http->path, pattern));
1252
1253 }
1254
1255
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is parsed in place, it is neither copied
 *                nor modified.  Within an item the first '-' splits
 *                a range into lower and upper bound.  A field
 *                without leading decimal number counts as 0, and an
 *                upper bound of 0 (which includes an omitted one)
 *                means 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list.  NULL is treated like
 *                           a list that matches nothing.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      /* The item ends at the next comma or at the end of the list. */
      const char *comma = strchr(item, (int) ',');
      const size_t item_length = (NULL != comma) ?
         (size_t)(comma - item) : strlen(item);
      const char *dash = memchr(item, (int) '-', item_length);
      char *number_end;
      /* strtol() stops at ',' on its own, so it can't leave the item. */
      long min = strtol(item, &number_end, 10);

      if (NULL == dash)
      {
         /*
          * No dash, check for equality
          */
         if ((long)port == min)
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         long max = strtol(dash + 1, NULL, 10);

         if (number_end > dash)
         {
            /*
             * Only white space precedes the dash, so strtol() took
             * the dash for a sign and parsed the upper bound as a
             * negative number.  The lower bound is in fact empty.
             */
            min = 0;
         }
         if (0 == max)
         {
            max = 65535;
         }
         if (((long)port >= min) && ((long)port <= max))
         {
            return 1;
         }
      }

      /*
       * Jump to next item
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1334
1335
1336 /*********************************************************************
1337  *
1338  * Function    :  parse_forwarder_address
1339  *
1340  * Description :  Parse out the host and port from a forwarder address.
1341  *
1342  * Parameters  :
1343  *          1  :  address = The forwarder address to parse.
1344  *          2  :  hostname = Used to return the hostname. NULL on error.
1345  *          3  :  port = Used to return the port. Untouched if no port
1346  *                       is specified.
1347  *
1348  * Returns     :  JB_ERR_OK on success
1349  *                JB_ERR_MEMORY on out of memory
1350  *                JB_ERR_PARSE on malformed address.
1351  *
1352  *********************************************************************/
1353 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1354 {
1355    char *p = address;
1356
1357    if ((*address == '[') && (NULL == strchr(address, ']')))
1358    {
1359       /* XXX: Should do some more validity checks here. */
1360       return JB_ERR_PARSE;
1361    }
1362
1363    *hostname = strdup_or_die(address);
1364
1365    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1366    {
1367       *p++ = '\0';
1368       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1369       if (*p == ':')
1370       {
1371          *port = (int)strtol(++p, NULL, 0);
1372       }
1373    }
1374    else if (NULL != (p = strchr(*hostname, ':')))
1375    {
1376       *p++ = '\0';
1377       *port = (int)strtol(p, NULL, 0);
1378    }
1379
1380    return JB_ERR_OK;
1381
1382 }
1383
1384
1385 /*
1386   Local Variables:
1387   tab-width: 3
1388   end:
1389 */