urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.81 2014/05/26 10:48:07 fabiankeil Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001-2011
  10  *                the Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  *********************************************************************/
  35
  36
  37 #include "config.h"
  38
  39 #ifndef _WIN32
  40 #include <stdio.h>
  41 #include <sys/types.h>
  42 #endif
  43
  44 #include <stdlib.h>
  45 #include <ctype.h>
  46 #include <assert.h>
  47 #include <string.h>
  48
  49 #if !defined(_WIN32) && !defined(__OS2__)
  50 #include <unistd.h>
  51 #endif
  52
  53 #include "project.h"
  54 #include "urlmatch.h"
  55 #include "ssplit.h"
  56 #include "miscutil.h"
  57 #include "errlog.h"
  58
/* Holds the value of URLMATCH_H_VERSION, which is defined outside this file. */
const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;

/*
 * Selects how compile_pattern() decorates a pattern before
 * handing it to the regex compiler.
 */
enum regex_anchoring
{
   NO_ANCHORING,        /* Pattern is compiled unchanged ("%s"). */
   LEFT_ANCHORED,       /* A '^' is prepended ("^%s"). */
   RIGHT_ANCHORED,      /* A '$' is appended ("%s$"). */
   RIGHT_ANCHORED_HOST  /* "\.?$" is appended, i.e. an optional trailing dot is accepted. */
};
/* Forward declaration: compile_url_pattern() calls this before either definition appears. */
static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern);
  69
  70 /*********************************************************************
  71  *
  72  * Function    :  free_http_request
  73  *
  74  * Description :  Freez a http_request structure
  75  *
  76  * Parameters  :
  77  *          1  :  http = points to a http_request structure to free
  78  *
  79  * Returns     :  N/A
  80  *
  81  *********************************************************************/
  82 void free_http_request(struct http_request *http)
  83 {
  84    assert(http);
  85
  86    freez(http->cmd);
  87    freez(http->ocmd);
  88    freez(http->gpc);
  89    freez(http->host);
  90    freez(http->url);
  91    freez(http->hostport);
  92    freez(http->path);
  93    freez(http->ver);
  94    freez(http->host_ip_addr_str);
  95 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
  96    freez(http->dbuffer);
  97    freez(http->dvec);
  98    http->dcount = 0;
  99 #endif
 100 }
 101
 102
 103 #ifndef FEATURE_EXTENDED_HOST_PATTERNS
 104 /*********************************************************************
 105  *
 106  * Function    :  init_domain_components
 107  *
 108  * Description :  Splits the domain name so we can compare it
 109  *                against wildcards. It used to be part of
 110  *                parse_http_url, but was separated because the
 111  *                same code is required in chat in case of
 112  *                intercepted requests.
 113  *
 114  * Parameters  :
 115  *          1  :  http = pointer to the http structure to hold elements.
 116  *
 117  * Returns     :  JB_ERR_OK on success
 118  *                JB_ERR_PARSE on malformed command/URL
 119  *                             or >100 domains deep.
 120  *
 121  *********************************************************************/
 122 jb_err init_domain_components(struct http_request *http)
 123 {
 124    char *vec[BUFFER_SIZE];
 125    size_t size;
 126    char *p;
 127
 128    http->dbuffer = strdup_or_die(http->host);
 129
 130    /* map to lower case */
 131    for (p = http->dbuffer; *p ; p++)
 132    {
 133       *p = (char)privoxy_tolower(*p);
 134    }
 135
 136    /* split the domain name into components */
 137    http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
 138
 139    if (http->dcount <= 0)
 140    {
 141       /*
 142        * Error: More than SZ(vec) components in domain
 143        *    or: no components in domain
 144        */
 145       log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
 146       return JB_ERR_PARSE;
 147    }
 148
 149    /* save a copy of the pointers in dvec */
 150    size = (size_t)http->dcount * sizeof(*http->dvec);
 151
 152    http->dvec = malloc_or_die(size);
 153
 154    memcpy(http->dvec, vec, size);
 155
 156    return JB_ERR_OK;
 157 }
 158 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
 159
 160
 161 /*********************************************************************
 162  *
 163  * Function    :  url_requires_percent_encoding
 164  *
 165  * Description :  Checks if an URL contains invalid characters
 166  *                according to RFC 3986 that should be percent-encoded.
 167  *                Does not verify whether or not the passed string
 168  *                actually is a valid URL.
 169  *
 170  * Parameters  :
 171  *          1  :  url = URL to check
 172  *
 173  * Returns     :  True in case of valid URLs, false otherwise
 174  *
 175  *********************************************************************/
 176 int url_requires_percent_encoding(const char *url)
 177 {
 178    static const char allowed_characters[128] = {
 179       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 180       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 181       '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
 182       '\0', '\0', '\0', '!',  '\0', '#',  '$',  '%',  '&',  '\'',
 183       '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',  '0',  '1',
 184       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
 185       '\0', '=',  '\0', '?',  '@',  'A',  'B',  'C',  'D',  'E',
 186       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
 187       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
 188       'Z',  '[',  '\0', ']',  '\0', '_',  '\0', 'a',  'b',  'c',
 189       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
 190       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
 191       'x',  'y',  'z',  '\0', '\0', '\0', '~',  '\0'
 192    };
 193
 194    while (*url != '\0')
 195    {
 196       const unsigned int i = (unsigned char)*url++;
 197       if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
 198       {
 199          return TRUE;
 200       }
 201    }
 202
 203    return FALSE;
 204
 205 }
 206
 207
 208 /*********************************************************************
 209  *
 210  * Function    :  parse_http_url
 211  *
 212  * Description :  Parse out the host and port from the URL.  Find the
 213  *                hostname & path, port (if ':'), and/or password (if '@')
 214  *
 215  * Parameters  :
 216  *          1  :  url = URL (or is it URI?) to break down
 217  *          2  :  http = pointer to the http structure to hold elements.
 218  *                       Must be initialized with valid values (like NULLs).
 219  *          3  :  require_protocol = Whether or not URLs without
 220  *                                   protocol are acceptable.
 221  *
 222  * Returns     :  JB_ERR_OK on success
 223  *                JB_ERR_PARSE on malformed command/URL
 224  *                             or >100 domains deep.
 225  *
 226  *********************************************************************/
 227 jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
 228 {
 229    int host_available = 1; /* A proxy can dream. */
 230
 231    /*
 232     * Save our initial URL
 233     */
 234    http->url = strdup_or_die(url);
 235
 236    /*
 237     * Check for * URI. If found, we're done.
 238     */
 239    if (*http->url == '*')
 240    {
 241       http->path = strdup_or_die("*");
 242       http->hostport = strdup_or_die("");
 243       if (http->url[1] != '\0')
 244       {
 245          return JB_ERR_PARSE;
 246       }
 247       return JB_ERR_OK;
 248    }
 249
 250
 251    /*
 252     * Split URL into protocol,hostport,path.
 253     */
 254    {
 255       char *buf;
 256       char *url_noproto;
 257       char *url_path;
 258
 259       buf = strdup_or_die(url);
 260
 261       /* Find the start of the URL in our scratch space */
 262       url_noproto = buf;
 263       if (strncmpic(url_noproto, "http://",  7) == 0)
 264       {
 265          url_noproto += 7;
 266       }
 267       else if (strncmpic(url_noproto, "https://", 8) == 0)
 268       {
 269          /*
 270           * Should only happen when called from cgi_show_url_info().
 271           */
 272          url_noproto += 8;
 273          http->ssl = 1;
 274       }
 275       else if (*url_noproto == '/')
 276       {
 277         /*
 278          * Short request line without protocol and host.
 279          * Most likely because the client's request
 280          * was intercepted and redirected into Privoxy.
 281          */
 282          http->host = NULL;
 283          host_available = 0;
 284       }
 285       else if (require_protocol)
 286       {
 287          freez(buf);
 288          return JB_ERR_PARSE;
 289       }
 290
 291       url_path = strchr(url_noproto, '/');
 292       if (url_path != NULL)
 293       {
 294          /*
 295           * Got a path.
 296           *
 297           * NOTE: The following line ignores the path for HTTPS URLS.
 298           * This means that you get consistent behaviour if you type a
 299           * https URL in and it's parsed by the function.  (When the
 300           * URL is actually retrieved, SSL hides the path part).
 301           */
 302          http->path = strdup_or_die(http->ssl ? "/" : url_path);
 303          *url_path = '\0';
 304          http->hostport = strdup_or_die(url_noproto);
 305       }
 306       else
 307       {
 308          /*
 309           * Repair broken HTTP requests that don't contain a path,
 310           * or CONNECT requests
 311           */
 312          http->path = strdup_or_die("/");
 313          http->hostport = strdup_or_die(url_noproto);
 314       }
 315
 316       freez(buf);
 317    }
 318
 319    if (!host_available)
 320    {
 321       /* Without host, there is nothing left to do here */
 322       return JB_ERR_OK;
 323    }
 324
 325    /*
 326     * Split hostport into user/password (ignored), host, port.
 327     */
 328    {
 329       char *buf;
 330       char *host;
 331       char *port;
 332
 333       buf = strdup_or_die(http->hostport);
 334
 335       /* check if url contains username and/or password */
 336       host = strchr(buf, '@');
 337       if (host != NULL)
 338       {
 339          /* Contains username/password, skip it and the @ sign. */
 340          host++;
 341       }
 342       else
 343       {
 344          /* No username or password. */
 345          host = buf;
 346       }
 347
 348       /* Move after hostname before port number */
 349       if (*host == '[')
 350       {
 351          /* Numeric IPv6 address delimited by brackets */
 352          host++;
 353          port = strchr(host, ']');
 354
 355          if (port == NULL)
 356          {
 357             /* Missing closing bracket */
 358             freez(buf);
 359             return JB_ERR_PARSE;
 360          }
 361
 362          *port++ = '\0';
 363
 364          if (*port == '\0')
 365          {
 366             port = NULL;
 367          }
 368          else if (*port != ':')
 369          {
 370             /* Garbage after closing bracket */
 371             freez(buf);
 372             return JB_ERR_PARSE;
 373          }
 374       }
 375       else
 376       {
 377          /* Plain non-escaped hostname */
 378          port = strchr(host, ':');
 379       }
 380
 381       /* check if url contains port */
 382       if (port != NULL)
 383       {
 384          /* Contains port */
 385          char *endptr;
 386          long parsed_port;
 387          /* Terminate hostname and point to start of port string */
 388          *port++ = '\0';
 389          parsed_port = strtol(port, &endptr, 10);
 390          if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
 391          {
 392             log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
 393             freez(buf);
 394             return JB_ERR_PARSE;
 395          }
 396          http->port = (int)parsed_port;
 397       }
 398       else
 399       {
 400          /* No port specified. */
 401          http->port = (http->ssl ? 443 : 80);
 402       }
 403
 404       http->host = strdup_or_die(host);
 405
 406       freez(buf);
 407    }
 408
 409 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 410    return JB_ERR_OK;
 411 #else
 412    /* Split domain name so we can compare it against wildcards */
 413    return init_domain_components(http);
 414 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
 415
 416 }
 417
 418
 419 /*********************************************************************
 420  *
 421  * Function    :  unknown_method
 422  *
 423  * Description :  Checks whether a method is unknown.
 424  *
 425  * Parameters  :
 426  *          1  :  method = points to a http method
 427  *
 428  * Returns     :  TRUE if it's unknown, FALSE otherwise.
 429  *
 430  *********************************************************************/
 431 static int unknown_method(const char *method)
 432 {
 433    static const char * const known_http_methods[] = {
 434       /* Basic HTTP request type */
 435       "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
 436       /* webDAV extensions (RFC2518) */
 437       "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
 438       /*
 439        * Microsoft webDAV extension for Exchange 2000.  See:
 440        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 441        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 442        */
 443       "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
 444       /*
 445        * Another Microsoft webDAV extension for Exchange 2000.  See:
 446        * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
 447        * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
 448        * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
 449        */
 450       "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
 451       /*
 452        * Yet another WebDAV extension, this time for
 453        * Web Distributed Authoring and Versioning (RFC3253)
 454        */
 455       "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
 456       "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
 457    };
 458    int i;
 459
 460    for (i = 0; i < SZ(known_http_methods); i++)
 461    {
 462       if (0 == strcmpic(method, known_http_methods[i]))
 463       {
 464          return FALSE;
 465       }
 466    }
 467
 468    return TRUE;
 469
 470 }
 471
 472
 473 /*********************************************************************
 474  *
 475  * Function    :  parse_http_request
 476  *
 477  * Description :  Parse out the host and port from the URL.  Find the
 478  *                hostname & path, port (if ':'), and/or password (if '@')
 479  *
 480  * Parameters  :
 481  *          1  :  req = HTTP request line to break down
 482  *          2  :  http = pointer to the http structure to hold elements
 483  *
 484  * Returns     :  JB_ERR_OK on success
 485  *                JB_ERR_CGI_PARAMS on malformed command/URL
 486  *                                  or >100 domains deep.
 487  *
 488  *********************************************************************/
 489 jb_err parse_http_request(const char *req, struct http_request *http)
 490 {
 491    char *buf;
 492    char *v[3];
 493    int n;
 494    jb_err err;
 495
 496    memset(http, '\0', sizeof(*http));
 497
 498    buf = strdup_or_die(req);
 499
 500    n = ssplit(buf, " \r\n", v, SZ(v));
 501    if (n != 3)
 502    {
 503       freez(buf);
 504       return JB_ERR_PARSE;
 505    }
 506
 507    /*
 508     * Fail in case of unknown methods
 509     * which we might not handle correctly.
 510     *
 511     * XXX: There should be a config option
 512     * to forward requests with unknown methods
 513     * anyway. Most of them don't need special
 514     * steps.
 515     */
 516    if (unknown_method(v[0]))
 517    {
 518       log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
 519       freez(buf);
 520       return JB_ERR_PARSE;
 521    }
 522
 523    if (strcmpic(v[2], "HTTP/1.1") && strcmpic(v[2], "HTTP/1.0"))
 524    {
 525       log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
 526          "versions are 1.0 and 1.1. This rules out: %s", v[2]);
 527       freez(buf);
 528       return JB_ERR_PARSE;
 529    }
 530
 531    http->ssl = !strcmpic(v[0], "CONNECT");
 532
 533    err = parse_http_url(v[1], http, !http->ssl);
 534    if (err)
 535    {
 536       freez(buf);
 537       return err;
 538    }
 539
 540    /*
 541     * Copy the details into the structure
 542     */
 543    http->cmd = strdup_or_die(req);
 544    http->gpc = strdup_or_die(v[0]);
 545    http->ver = strdup_or_die(v[2]);
 546    http->ocmd = strdup_or_die(http->cmd);
 547
 548    freez(buf);
 549
 550    return JB_ERR_OK;
 551
 552 }
 553
 554
 555 /*********************************************************************
 556  *
 557  * Function    :  compile_pattern
 558  *
 559  * Description :  Compiles a host, domain or TAG pattern.
 560  *
 561  * Parameters  :
 562  *          1  :  pattern = The pattern to compile.
 563  *          2  :  anchoring = How the regex should be modified
 564  *                            before compilation. Can be either
 565  *                            one of NO_ANCHORING, LEFT_ANCHORED,
 566  *                            RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
 567  *          3  :  url     = In case of failures, the spec member is
 568  *                          logged and the structure freed.
 569  *          4  :  regex   = Where the compiled regex should be stored.
 570  *
 571  * Returns     :  JB_ERR_OK - Success
 572  *                JB_ERR_MEMORY - Out of memory
 573  *                JB_ERR_PARSE - Cannot parse regex
 574  *
 575  *********************************************************************/
 576 static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
 577                               struct pattern_spec *url, regex_t **regex)
 578 {
 579    int errcode;
 580    char rebuf[BUFFER_SIZE];
 581    const char *fmt = NULL;
 582
 583    assert(pattern);
 584    assert(strlen(pattern) < sizeof(rebuf) - 2);
 585
 586    if (pattern[0] == '\0')
 587    {
 588       *regex = NULL;
 589       return JB_ERR_OK;
 590    }
 591
 592    switch (anchoring)
 593    {
 594       case NO_ANCHORING:
 595          fmt = "%s";
 596          break;
 597       case RIGHT_ANCHORED:
 598          fmt = "%s$";
 599          break;
 600       case RIGHT_ANCHORED_HOST:
 601          fmt = "%s\\.?$";
 602          break;
 603       case LEFT_ANCHORED:
 604          fmt = "^%s";
 605          break;
 606       default:
 607          log_error(LOG_LEVEL_FATAL,
 608             "Invalid anchoring in compile_pattern %d", anchoring);
 609    }
 610
 611    *regex = zalloc(sizeof(**regex));
 612    if (NULL == *regex)
 613    {
 614       free_pattern_spec(url);
 615       return JB_ERR_MEMORY;
 616    }
 617
 618    snprintf(rebuf, sizeof(rebuf), fmt, pattern);
 619
 620    errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 621
 622    if (errcode)
 623    {
 624       size_t errlen = regerror(errcode, *regex, rebuf, sizeof(rebuf));
 625       if (errlen > (sizeof(rebuf) - (size_t)1))
 626       {
 627          errlen = sizeof(rebuf) - (size_t)1;
 628       }
 629       rebuf[errlen] = '\0';
 630       log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
 631          pattern, url->spec, rebuf);
 632       free_pattern_spec(url);
 633
 634       return JB_ERR_PARSE;
 635    }
 636
 637    return JB_ERR_OK;
 638
 639 }
 640
 641
 642 /*********************************************************************
 643  *
 644  * Function    :  compile_url_pattern
 645  *
 646  * Description :  Compiles the three parts of an URL pattern.
 647  *
 648  * Parameters  :
 649  *          1  :  url = Target pattern_spec to be filled in.
 650  *          2  :  buf = The url pattern to compile. Will be messed up.
 651  *
 652  * Returns     :  JB_ERR_OK - Success
 653  *                JB_ERR_MEMORY - Out of memory
 654  *                JB_ERR_PARSE - Cannot parse regex
 655  *
 656  *********************************************************************/
 657 static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
 658 {
 659    char *p;
 660
 661    p = strchr(buf, '/');
 662    if (NULL != p)
 663    {
 664       /*
 665        * Only compile the regex if it consists of more than
 666        * a single slash, otherwise it wouldn't affect the result.
 667        */
 668       if (p[1] != '\0')
 669       {
 670          /*
 671           * XXX: does it make sense to compile the slash at the beginning?
 672           */
 673          jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
 674
 675          if (JB_ERR_OK != err)
 676          {
 677             return err;
 678          }
 679       }
 680       *p = '\0';
 681    }
 682
 683    /*
 684     * IPv6 numeric hostnames can contain colons, thus we need
 685     * to delimit the hostname before the real port separator.
 686     * As brackets are already used in the hostname pattern,
 687     * we use angle brackets ('<', '>') instead.
 688     */
 689    if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
 690    {
 691       *p++ = '\0';
 692       buf++;
 693
 694       if (*p == '\0')
 695       {
 696          /* IPv6 address without port number */
 697          p = NULL;
 698       }
 699       else if (*p != ':')
 700       {
 701          /* Garbage after address delimiter */
 702          return JB_ERR_PARSE;
 703       }
 704    }
 705    else
 706    {
 707       p = strchr(buf, ':');
 708    }
 709
 710    if (NULL != p)
 711    {
 712       *p++ = '\0';
 713       url->pattern.url_spec.port_list = strdup_or_die(p);
 714    }
 715    else
 716    {
 717       url->pattern.url_spec.port_list = NULL;
 718    }
 719
 720    if (buf[0] != '\0')
 721    {
 722       return compile_host_pattern(url, buf);
 723    }
 724
 725    return JB_ERR_OK;
 726
 727 }
 728
 729
 730 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
 731 /*********************************************************************
 732  *
 733  * Function    :  compile_host_pattern
 734  *
 735  * Description :  Parses and compiles a host pattern.
 736  *
 737  * Parameters  :
 738  *          1  :  url = Target pattern_spec to be filled in.
 739  *          2  :  host_pattern = Host pattern to compile.
 740  *
 741  * Returns     :  JB_ERR_OK - Success
 742  *                JB_ERR_MEMORY - Out of memory
 743  *                JB_ERR_PARSE - Cannot parse regex
 744  *
 745  *********************************************************************/
 746 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 747 {
 748    return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, &url->pattern.url_spec.host_regex);
 749 }
 750
 751 #else
 752
 753 /*********************************************************************
 754  *
 755  * Function    :  compile_host_pattern
 756  *
 757  * Description :  Parses and "compiles" an old-school host pattern.
 758  *
 759  * Parameters  :
 760  *          1  :  url = Target pattern_spec to be filled in.
 761  *          2  :  host_pattern = Host pattern to parse.
 762  *
 763  * Returns     :  JB_ERR_OK - Success
 764  *                JB_ERR_PARSE - Cannot parse regex
 765  *
 766  *********************************************************************/
 767 static jb_err compile_host_pattern(struct pattern_spec *url, const char *host_pattern)
 768 {
 769    char *v[150];
 770    size_t size;
 771    char *p;
 772
 773    /*
 774     * Parse domain part
 775     */
 776    if (host_pattern[strlen(host_pattern) - 1] == '.')
 777    {
 778       url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
 779    }
 780    if (host_pattern[0] == '.')
 781    {
 782       url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
 783    }
 784
 785    /*
 786     * Split domain into components
 787     */
 788    url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
 789
 790    /*
 791     * Map to lower case
 792     */
 793    for (p = url->pattern.url_spec.dbuffer; *p ; p++)
 794    {
 795       *p = (char)privoxy_tolower(*p);
 796    }
 797
 798    /*
 799     * Split the domain name into components
 800     */
 801    url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
 802
 803    if (url->pattern.url_spec.dcount < 0)
 804    {
 805       free_pattern_spec(url);
 806       return JB_ERR_PARSE;
 807    }
 808    else if (url->pattern.url_spec.dcount != 0)
 809    {
 810       /*
 811        * Save a copy of the pointers in dvec
 812        */
 813       size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
 814
 815       url->pattern.url_spec.dvec = malloc_or_die(size);
 816
 817       memcpy(url->pattern.url_spec.dvec, v, size);
 818    }
 819    /*
 820     * else dcount == 0 in which case we needn't do anything,
 821     * since dvec will never be accessed and the pattern will
 822     * match all domains.
 823     */
 824    return JB_ERR_OK;
 825 }
 826
 827
 828 /*********************************************************************
 829  *
 830  * Function    :  simplematch
 831  *
 832  * Description :  String matching, with a (greedy) '*' wildcard that
 833  *                stands for zero or more arbitrary characters and
 834  *                character classes in [], which take both enumerations
 835  *                and ranges.
 836  *
 837  * Parameters  :
 838  *          1  :  pattern = pattern for matching
 839  *          2  :  text    = text to be matched
 840  *
 841  * Returns     :  0 if match, else nonzero
 842  *
 843  *********************************************************************/
 844 static int simplematch(const char *pattern, const char *text)
 845 {
 846    const unsigned char *pat = (const unsigned char *)pattern;
 847    const unsigned char *txt = (const unsigned char *)text;
 848    const unsigned char *fallback = pat;
 849    int wildcard = 0;
 850
 851    unsigned char lastchar = 'a';
 852    unsigned i;
 853    unsigned char charmap[32];
 854
 855    while (*txt)
 856    {
 857
 858       /* EOF pattern but !EOF text? */
 859       if (*pat == '\0')
 860       {
 861          if (wildcard)
 862          {
 863             pat = fallback;
 864          }
 865          else
 866          {
 867             return 1;
 868          }
 869       }
 870
 871       /* '*' in the pattern?  */
 872       if (*pat == '*')
 873       {
 874
 875          /* The pattern ends afterwards? Speed up the return. */
 876          if (*++pat == '\0')
 877          {
 878             return 0;
 879          }
 880
 881          /* Else, set wildcard mode and remember position after '*' */
 882          wildcard = 1;
 883          fallback = pat;
 884       }
 885
 886       /* Character range specification? */
 887       if (*pat == '[')
 888       {
 889          memset(charmap, '\0', sizeof(charmap));
 890
 891          while (*++pat != ']')
 892          {
 893             if (!*pat)
 894             {
 895                return 1;
 896             }
 897             else if (*pat == '-')
 898             {
 899                if ((*++pat == ']') || *pat == '\0')
 900                {
 901                   return(1);
 902                }
 903                for (i = lastchar; i <= *pat; i++)
 904                {
 905                   charmap[i / 8] |= (unsigned char)(1 << (i % 8));
 906                }
 907             }
 908             else
 909             {
 910                charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
 911                lastchar = *pat;
 912             }
 913          }
 914       } /* -END- if Character range specification */
 915
 916
 917       /*
 918        * Char match, or char range match?
 919        */
 920       if ((*pat == *txt)
 921        || (*pat == '?')
 922        || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
 923       {
 924          /*
 925           * Success: Go ahead
 926           */
 927          pat++;
 928       }
 929       else if (!wildcard)
 930       {
 931          /*
 932           * No match && no wildcard: No luck
 933           */
 934          return 1;
 935       }
 936       else if (pat != fallback)
 937       {
 938          /*
 939           * Increment text pointer if in char range matching
 940           */
 941          if (*pat == ']')
 942          {
 943             txt++;
 944          }
 945          /*
 946           * Wildcard mode && nonmatch beyond fallback: Rewind pattern
 947           */
 948          pat = fallback;
 949          /*
 950           * Restart matching from current text pointer
 951           */
 952          continue;
 953       }
 954       txt++;
 955    }
 956
 957    /* Cut off extra '*'s */
 958    if (*pat == '*') pat++;
 959
 960    /* If this is the pattern's end, fine! */
 961    return(*pat);
 962
 963 }
 964
 965
 966 /*********************************************************************
 967  *
 968  * Function    :  simple_domaincmp
 969  *
 970  * Description :  Domain-wise Compare fqdn's.  The comparison is
 971  *                both left- and right-anchored.  The individual
 972  *                domain names are compared with simplematch().
 973  *                This is only used by domain_match.
 974  *
 975  * Parameters  :
 976  *          1  :  pv = array of patterns to compare
 977  *          2  :  fv = array of domain components to compare
 978  *          3  :  len = length of the arrays (both arrays are the
 979  *                      same length - if they weren't, it couldn't
 980  *                      possibly be a match).
 981  *
 982  * Returns     :  0 => domains are equivalent, else no match.
 983  *
 984  *********************************************************************/
 985 static int simple_domaincmp(char **pv, char **fv, int len)
 986 {
 987    int n;
 988
 989    for (n = 0; n < len; n++)
 990    {
 991       if (simplematch(pv[n], fv[n]))
 992       {
 993          return 1;
 994       }
 995    }
 996
 997    return 0;
 998
 999 }
1000
1001
1002 /*********************************************************************
1003  *
1004  * Function    :  domain_match
1005  *
1006  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
1007  *                p.pattern->unachored, the comparison is un-, left-,
1008  *                right-anchored, or both.
1009  *                The individual domain names are compared with
1010  *                simplematch().
1011  *
1012  * Parameters  :
1013  *          1  :  p = a domain that may contain a '*' as a wildcard.
1014  *          2  :  fqdn = domain name against which the patterns are compared.
1015  *
1016  * Returns     :  0 => domains are equivalent, else no match.
1017  *
1018  *********************************************************************/
1019 static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
1020 {
1021    char **pv, **fv;  /* vectors  */
1022    int    plen, flen;
1023    int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
1024
1025    plen = p->pattern.url_spec.dcount;
1026    flen = fqdn->dcount;
1027
1028    if (flen < plen)
1029    {
1030       /* fqdn is too short to match this pattern */
1031       return 1;
1032    }
1033
1034    pv   = p->pattern.url_spec.dvec;
1035    fv   = fqdn->dvec;
1036
1037    if (unanchored == ANCHOR_LEFT)
1038    {
1039       /*
1040        * Right anchored.
1041        *
1042        * Convert this into a fully anchored pattern with
1043        * the fqdn and pattern the same length
1044        */
1045       fv += (flen - plen); /* flen - plen >= 0 due to check above */
1046       return simple_domaincmp(pv, fv, plen);
1047    }
1048    else if (unanchored == 0)
1049    {
1050       /* Fully anchored, check length */
1051       if (flen != plen)
1052       {
1053          return 1;
1054       }
1055       return simple_domaincmp(pv, fv, plen);
1056    }
1057    else if (unanchored == ANCHOR_RIGHT)
1058    {
1059       /* Left anchored, ignore all extra in fqdn */
1060       return simple_domaincmp(pv, fv, plen);
1061    }
1062    else
1063    {
1064       /* Unanchored */
1065       int n;
1066       int maxn = flen - plen;
1067       for (n = 0; n <= maxn; n++)
1068       {
1069          if (!simple_domaincmp(pv, fv, plen))
1070          {
1071             return 0;
1072          }
1073          /*
1074           * Doesn't match from start of fqdn
1075           * Try skipping first part of fqdn
1076           */
1077          fv++;
1078       }
1079       return 1;
1080    }
1081
1082 }
1083 #endif /* def FEATURE_EXTENDED_HOST_PATTERNS */
1084
1085
1086 /*********************************************************************
1087  *
1088  * Function    :  create_pattern_spec
1089  *
1090  * Description :  Creates a "pattern_spec" structure from a string.
1091  *                When finished, free with free_pattern_spec().
1092  *
1093  * Parameters  :
1094  *          1  :  pattern = Target pattern_spec to be filled in.
1095  *                          Will be zeroed before use.
1096  *          2  :  buf = Source pattern, null terminated.  NOTE: The
1097  *                      contents of this buffer are destroyed by this
1098  *                      function.  If this function succeeds, the
1099  *                      buffer is copied to pattern->spec.  If this
1100  *                      function fails, the contents of the buffer
1101  *                      are lost forever.
1102  *
1103  * Returns     :  JB_ERR_OK - Success
1104  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
1105  *                               written to system log)
1106  *
1107  *********************************************************************/
1108 jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
1109 {
1110    static const struct
1111    {
1112       /** The tag pattern prefix to match */
1113       const char *prefix;
1114
1115       /** The length of the prefix to match */
1116       const size_t prefix_length;
1117
1118       /** The pattern flag */
1119       const unsigned flag;
1120    } tag_pattern[] = {
1121       { "TAG:",              4, PATTERN_SPEC_TAG_PATTERN},
1122       { "NO-REQUEST-TAG:",  15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
1123       { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
1124    };
1125    int i;
1126
1127    assert(pattern);
1128    assert(buf);
1129
1130    memset(pattern, '\0', sizeof(*pattern));
1131
1132    /* Remember the original specification for the CGI pages. */
1133    pattern->spec = strdup_or_die(buf);
1134
1135    /* Check if it's a tag pattern */
1136    for (i = 0; i < SZ(tag_pattern); i++)
1137    {
1138       if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
1139       {
1140          /* The regex starts after the prefix */
1141          const char *tag_regex = buf + tag_pattern[i].prefix_length;
1142
1143          pattern->flags |= tag_pattern[i].flag;
1144
1145          return compile_pattern(tag_regex, NO_ANCHORING, pattern,
1146             &pattern->pattern.tag_regex);
1147       }
1148    }
1149
1150    /* If it isn't a tag pattern it must be an URL pattern. */
1151    pattern->flags |= PATTERN_SPEC_URL_PATTERN;
1152
1153    return compile_url_pattern(pattern, buf);
1154
1155 }
1156
1157
1158 /*********************************************************************
1159  *
1160  * Function    :  free_pattern_spec
1161  *
1162  * Description :  Called from the "unloaders".  Freez the pattern
1163  *                structure elements.
1164  *
1165  * Parameters  :
1166  *          1  :  pattern = pointer to a pattern_spec structure.
1167  *
1168  * Returns     :  N/A
1169  *
1170  *********************************************************************/
1171 void free_pattern_spec(struct pattern_spec *pattern)
1172 {
1173    if (pattern == NULL) return;
1174
1175    freez(pattern->spec);
1176 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1177    if (pattern->pattern.url_spec.host_regex)
1178    {
1179       regfree(pattern->pattern.url_spec.host_regex);
1180       freez(pattern->pattern.url_spec.host_regex);
1181    }
1182 #else
1183    freez(pattern->pattern.url_spec.dbuffer);
1184    freez(pattern->pattern.url_spec.dvec);
1185    pattern->pattern.url_spec.dcount = 0;
1186 #endif /* ndef FEATURE_EXTENDED_HOST_PATTERNS */
1187    freez(pattern->pattern.url_spec.port_list);
1188    if (pattern->pattern.url_spec.preg)
1189    {
1190       regfree(pattern->pattern.url_spec.preg);
1191       freez(pattern->pattern.url_spec.preg);
1192    }
1193    if (pattern->pattern.tag_regex)
1194    {
1195       regfree(pattern->pattern.tag_regex);
1196       freez(pattern->pattern.tag_regex);
1197    }
1198 }
1199
1200
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks whether a port is accepted by a port list.
 *                A missing port list accepts every port.
 *
 * Parameters  :
 *          1  :  port      = The port to check.
 *          2  :  port_list = The port list to check against,
 *                            or NULL if there is no restriction.
 *
 * Returns     :  1 if the port is accepted, 0 otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (NULL == port_list)
   {
      /* The pattern doesn't restrict the port. */
      return 1;
   }

   return (0 != match_portlist(port_list, port));
}
1218
1219
1220 /*********************************************************************
1221  *
1222  * Function    :  host_matches
1223  *
1224  * Description :  Compares a host against a host pattern.
1225  *
1226  * Parameters  :
1227  *          1  :  url = The URL to match
1228  *          2  :  pattern = The URL pattern
1229  *
1230  * Returns     :  TRUE for yes, FALSE otherwise.
1231  *
1232  *********************************************************************/
1233 static int host_matches(const struct http_request *http,
1234                         const struct pattern_spec *pattern)
1235 {
1236 #ifdef FEATURE_EXTENDED_HOST_PATTERNS
1237    return ((NULL == pattern->pattern.url_spec.host_regex)
1238       || (0 == regexec(pattern->pattern.url_spec.host_regex, http->host, 0, NULL, 0)));
1239 #else
1240    return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
1241 #endif
1242 }
1243
1244
1245 /*********************************************************************
1246  *
1247  * Function    :  path_matches
1248  *
1249  * Description :  Compares a path against a path pattern.
1250  *
1251  * Parameters  :
1252  *          1  :  path = The path to match
1253  *          2  :  pattern = The URL pattern
1254  *
1255  * Returns     :  TRUE for yes, FALSE otherwise.
1256  *
1257  *********************************************************************/
1258 static int path_matches(const char *path, const struct pattern_spec *pattern)
1259 {
1260    return ((NULL == pattern->pattern.url_spec.preg)
1261       || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
1262 }
1263
1264
1265 /*********************************************************************
1266  *
1267  * Function    :  url_match
1268  *
1269  * Description :  Compare a URL against a URL pattern.
1270  *
1271  * Parameters  :
1272  *          1  :  pattern = a URL pattern
1273  *          2  :  url = URL to match
1274  *
1275  * Returns     :  Nonzero if the URL matches the pattern, else 0.
1276  *
1277  *********************************************************************/
1278 int url_match(const struct pattern_spec *pattern,
1279               const struct http_request *http)
1280 {
1281    if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
1282    {
1283       /* It's not an URL pattern and thus shouldn't be matched against URLs */
1284       return 0;
1285    }
1286
1287    return (port_matches(http->port, pattern->pattern.url_spec.port_list)
1288       && host_matches(http, pattern) && path_matches(http->path, pattern));
1289
1290 }
1291
1292
/*********************************************************************
 *
 * Function    :  portlist_number
 *
 * Description :  Helper for match_portlist().  Reads the decimal
 *                number at the start of a port list field without
 *                modifying the list.
 *
 * Parameters  :
 *          1  :  start = first character of the field
 *          2  :  limit = first character behind the field (a ',',
 *                        a '-' or the terminating NUL)
 *
 * Returns     :  The number, or 0 if the field doesn't start with
 *                one.  Values that don't fit are clamped by strtol().
 *
 *********************************************************************/
static long portlist_number(const char *start, const char *limit)
{
   char *stop;
   const long value = strtol(start, &stop, 10);

   if (stop > limit)
   {
      /*
       * The field contains no number, but strtol() took the
       * '-' that ends it for the sign of the following number.
       */
      return 0;
   }

   /* Without any digits strtol() returns 0, just like for an empty field. */
   return value;
}


/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place, nothing is allocated
 *                or modified.  In a range the first '-' separates
 *                minimum and maximum.  An omitted (or zero) minimum
 *                means 0, an omitted (or zero) maximum means 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list.  NULL matches nothing.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   /*
    * Loop through all items, checking for match
    */
   while (NULL != item)
   {
      const char *comma = strchr(item, (int) ',');
      const char *item_end = (NULL != comma) ? comma : item + strlen(item);
      const char *dash = memchr(item, (int) '-', (size_t)(item_end - item));

      if (NULL == dash)
      {
         /*
          * No dash, check for equality
          */
         if ((long)port == portlist_number(item, item_end))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         const long min = portlist_number(item, dash);
         long max = portlist_number(dash + 1, item_end);

         if (0 == max)
         {
            max = 65535;
         }

         if ((port >= min) && (port <= max))
         {
            return 1;
         }
      }

      /*
       * Jump to next item, if there is one
       */
      item = (NULL != comma) ? comma + 1 : NULL;
   }

   return 0;

}
1371
1372
1373 /*********************************************************************
1374  *
1375  * Function    :  parse_forwarder_address
1376  *
1377  * Description :  Parse out the host and port from a forwarder address.
1378  *
1379  * Parameters  :
1380  *          1  :  address = The forwarder address to parse.
1381  *          2  :  hostname = Used to return the hostname. NULL on error.
1382  *          3  :  port = Used to return the port. Untouched if no port
1383  *                       is specified.
1384  *
1385  * Returns     :  JB_ERR_OK on success
1386  *                JB_ERR_MEMORY on out of memory
1387  *                JB_ERR_PARSE on malformed address.
1388  *
1389  *********************************************************************/
1390 jb_err parse_forwarder_address(char *address, char **hostname, int *port)
1391 {
1392    char *p = address;
1393
1394    if ((*address == '[') && (NULL == strchr(address, ']')))
1395    {
1396       /* XXX: Should do some more validity checks here. */
1397       return JB_ERR_PARSE;
1398    }
1399
1400    *hostname = strdup_or_die(address);
1401
1402    if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
1403    {
1404       *p++ = '\0';
1405       memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
1406       if (*p == ':')
1407       {
1408          *port = (int)strtol(++p, NULL, 0);
1409       }
1410    }
1411    else if (NULL != (p = strchr(*hostname, ':')))
1412    {
1413       *p++ = '\0';
1414       *port = (int)strtol(p, NULL, 0);
1415    }
1416
1417    return JB_ERR_OK;
1418
1419 }
1420
1421
1422 /*
1423   Local Variables:
1424   tab-width: 3
1425   end:
1426 */