src/urlmatch.c

   /* Revision-control identification string for this source file ("$Id$" keyword text). */
   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.10 2002/05/12 21:40:37 jongfoster Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Defines functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001 the SourceForge
  10  *                Privoxy team. http://www.privoxy.org/
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  * Revisions   :
  35  *    $Log: urlmatch.c,v $
  36  *    Revision 1.10  2002/05/12 21:40:37  jongfoster
  37  *    - Removing some unused code
  38  *
  39  *    Revision 1.9  2002/04/04 00:36:36  gliptak
  40  *    always use pcre for matching
  41  *
  42  *    Revision 1.8  2002/04/03 23:32:47  jongfoster
  43  *    Fixing memory leak on error
  44  *
  45  *    Revision 1.7  2002/03/26 22:29:55  swa
  46  *    we have a new homepage!
  47  *
  48  *    Revision 1.6  2002/03/24 13:25:43  swa
  49  *    name change related issues
  50  *
  51  *    Revision 1.5  2002/03/13 00:27:05  jongfoster
  52  *    Killing warnings
  53  *
  54  *    Revision 1.4  2002/03/07 03:46:17  oes
  55  *    Fixed compiler warnings
  56  *
  57  *    Revision 1.3  2002/03/03 14:51:11  oes
  58  *    Fixed CLF logging: Added ocmd member for client's request to struct http_request
  59  *
  60  *    Revision 1.2  2002/01/21 00:14:09  jongfoster
  61  *    Correcting comment style
  62  *    Fixing an uninitialized memory bug in create_url_spec()
  63  *
  64  *    Revision 1.1  2002/01/17 20:53:46  jongfoster
  65  *    Moving all our URL and URL pattern parsing code to the same file - it
  66  *    was scattered around in filters.c, loaders.c and parsers.c.
  67  *
  68  *    Providing a single, simple url_match(pattern,url) function - rather than
  69  *    the 3-line match routine which was repeated all over the place.
  70  *
  71  *    Renaming free_url to free_url_spec, since it frees a struct url_spec.
  72  *
  73  *    Providing parse_http_url() so that URLs can be parsed without faking a
  74  *    HTTP request line for parse_http_request() or repeating the parsing
  75  *    code (both of which were techniques that were actually in use).
  76  *
  77  *    Standardizing that struct http_request is used to represent a URL, and
  78  *    struct url_spec is used to represent a URL pattern.  (Before, URLs were
  79  *    represented as separate variables and a partially-filled-in url_spec).
  80  *
  81  *
  82  *********************************************************************/
  83 \f
  84
  85 #include "config.h"
  86
  87 #ifndef _WIN32
  88 #include <stdio.h>
  89 #include <sys/types.h>
  90 #endif
  91
  92 #include <stdlib.h>
  93 #include <ctype.h>
  94 #include <assert.h>
  95 #include <string.h>
  96
  97 #if !defined(_WIN32) && !defined(__OS2__)
  98 #include <unistd.h>
  99 #endif
 100
 101 #include "project.h"
 102 #include "urlmatch.h"
 103 #include "ssplit.h"
 104 #include "miscutil.h"
 105 #include "errlog.h"
 106
 /*
  * Version string taken from the URLMATCH_H_VERSION macro.  The macro is
  * not defined in this file - presumably it comes from urlmatch.h and
  * identifies the header revision this file was compiled against
  * (TODO confirm).
  */
 107 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
 108
 109
 110 /*********************************************************************
 111  *
 112  * Function    :  free_http_request
 113  *
 114  * Description :  Freez a http_request structure
 115  *
 116  * Parameters  :
 117  *          1  :  http = points to a http_request structure to free
 118  *
 119  * Returns     :  N/A
 120  *
 121  *********************************************************************/
 122 void free_http_request(struct http_request *http)
 123 {
 124    assert(http);
 125
 126    freez(http->cmd);
 127    freez(http->ocmd);
 128    freez(http->gpc);
 129    freez(http->host);
 130    freez(http->url);
 131    freez(http->hostport);
 132    freez(http->path);
 133    freez(http->ver);
 134    freez(http->host_ip_addr_str);
 135    freez(http->dbuffer);
 136    freez(http->dvec);
 137    http->dcount = 0;
 138 }
 139
 140
 141 /*********************************************************************
 142  *
 143  * Function    :  parse_http_url
 144  *
 145  * Description :  Parse out the host and port from the URL.  Find the
 146  *                hostname & path, port (if ':'), and/or password (if '@')
 147  *
 148  * Parameters  :
 149  *          1  :  url = URL (or is it URI?) to break down
 150  *          2  :  http = pointer to the http structure to hold elements.
 151  *                       Will be zeroed before use.  Note that this
 152  *                       function sets the http->gpc and http->ver
 153  *                       members to NULL.
 154  *          3  :  csp = Current client state (buffers, headers, etc...)
 155  *
 156  * Returns     :  JB_ERR_OK on success
 157  *                JB_ERR_MEMORY on out of memory
 158  *                JB_ERR_CGI_PARAMS on malformed command/URL
 159  *                                  or >100 domains deep.
 160  *
 161  *********************************************************************/
 162 jb_err parse_http_url(const char * url,
 163                       struct http_request *http,
 164                       struct client_state *csp)
 165 {
 166    /*
 167     * Zero out the results structure
 168     */
 169    memset(http, '\0', sizeof(*http));
 170
 171
 172    /*
 173     * Save our initial URL
 174     */
 175    http->url = strdup(url);
 176    if (http->url == NULL)
 177    {
 178       return JB_ERR_MEMORY;
 179    }
 180
 181
 182    /*
 183     * Split URL into protocol,hostport,path.
 184     */
 185    {
 186       char *buf;
 187       char *url_noproto;
 188       char *url_path;
 189
 190       buf = strdup(url);
 191       if (buf == NULL)
 192       {
 193          return JB_ERR_MEMORY;
 194       }
 195
 196       /* Find the start of the URL in our scratch space */
 197       url_noproto = buf;
 198       if (strncmpic(url_noproto, "http://",  7) == 0)
 199       {
 200          url_noproto += 7;
 201          http->ssl = 0;
 202       }
 203       else if (strncmpic(url_noproto, "https://", 8) == 0)
 204       {
 205          url_noproto += 8;
 206          http->ssl = 1;
 207       }
 208       else
 209       {
 210          http->ssl = 0;
 211       }
 212
 213       url_path = strchr(url_noproto, '/');
 214       if (url_path != NULL)
 215       {
 216          /*
 217           * Got a path.
 218           *
 219           * NOTE: The following line ignores the path for HTTPS URLS.
 220           * This means that you get consistent behaviour if you type a
 221           * https URL in and it's parsed by the function.  (When the
 222           * URL is actually retrieved, SSL hides the path part).
 223           */
 224          http->path = strdup(http->ssl ? "/" : url_path);
 225          *url_path = '\0';
 226          http->hostport = strdup(url_noproto);
 227       }
 228       else
 229       {
 230          /*
 231           * Repair broken HTTP requests that don't contain a path,
 232           * or CONNECT requests
 233           */
 234          http->path = strdup("/");
 235          http->hostport = strdup(url_noproto);
 236       }
 237
 238       free(buf);
 239
 240       if ( (http->path == NULL)
 241         || (http->hostport == NULL))
 242       {
 243          free(buf);
 244          free_http_request(http);
 245          return JB_ERR_MEMORY;
 246       }
 247    }
 248
 249
 250    /*
 251     * Split hostport into user/password (ignored), host, port.
 252     */
 253    {
 254       char *buf;
 255       char *host;
 256       char *port;
 257
 258       buf = strdup(http->hostport);
 259       if (buf == NULL)
 260       {
 261          free_http_request(http);
 262          return JB_ERR_MEMORY;
 263       }
 264
 265       /* check if url contains username and/or password */
 266       host = strchr(buf, '@');
 267       if (host != NULL)
 268       {
 269          /* Contains username/password, skip it and the @ sign. */
 270          host++;
 271       }
 272       else
 273       {
 274          /* No username or password. */
 275          host = buf;
 276       }
 277
 278       /* check if url contains port */
 279       port = strchr(host, ':');
 280       if (port != NULL)
 281       {
 282          /* Contains port */
 283          /* Terminate hostname and point to start of port string */
 284          *port++ = '\0';
 285          http->port = atoi(port);
 286       }
 287       else
 288       {
 289          /* No port specified. */
 290          http->port = (http->ssl ? 143 : 80);
 291       }
 292
 293       http->host = strdup(host);
 294
 295       free(buf);
 296
 297       if (http->host == NULL)
 298       {
 299          free_http_request(http);
 300          return JB_ERR_MEMORY;
 301       }
 302    }
 303
 304
 305    /*
 306     * Split domain name so we can compare it against wildcards
 307     */
 308    {
 309       char *vec[BUFFER_SIZE];
 310       size_t size;
 311       char *p;
 312
 313       http->dbuffer = strdup(http->host);
 314       if (NULL == http->dbuffer)
 315       {
 316          free_http_request(http);
 317          return JB_ERR_MEMORY;
 318       }
 319
 320       /* map to lower case */
 321       for (p = http->dbuffer; *p ; p++)
 322       {
 323          *p = tolower((int)(unsigned char)*p);
 324       }
 325
 326       /* split the domain name into components */
 327       http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 328
 329       if (http->dcount <= 0)
 330       {
 331          /*
 332           * Error: More than SZ(vec) components in domain
 333           *    or: no components in domain
 334           */
 335          free_http_request(http);
 336          return JB_ERR_PARSE;
 337       }
 338
 339       /* save a copy of the pointers in dvec */
 340       size = http->dcount * sizeof(*http->dvec);
 341
 342       http->dvec = (char **)malloc(size);
 343       if (NULL == http->dvec)
 344       {
 345          free_http_request(http);
 346          return JB_ERR_MEMORY;
 347       }
 348
 349       memcpy(http->dvec, vec, size);
 350    }
 351
 352
 353    return JB_ERR_OK;
 354 }
 355
 356
 357 /*********************************************************************
 358  *
 359  * Function    :  parse_http_request
 360  *
 361  * Description :  Parse out the host and port from the URL.  Find the
 362  *                hostname & path, port (if ':'), and/or password (if '@')
 363  *
 364  * Parameters  :
 365  *          1  :  req = HTTP request line to break down
 366  *          2  :  http = pointer to the http structure to hold elements
 367  *          3  :  csp = Current client state (buffers, headers, etc...)
 368  *
 369  * Returns     :  JB_ERR_OK on success
 370  *                JB_ERR_MEMORY on out of memory
 371  *                JB_ERR_CGI_PARAMS on malformed command/URL
 372  *                                  or >100 domains deep.
 373  *
 374  *********************************************************************/
 375 jb_err parse_http_request(const char *req,
 376                           struct http_request *http,
 377                           struct client_state *csp)
 378 {
 379    char *buf;
 380    char *v[10];
 381    int n;
 382    jb_err err;
 383    int is_connect = 0;
 384
 385    memset(http, '\0', sizeof(*http));
 386
 387    buf = strdup(req);
 388    if (buf == NULL)
 389    {
 390       return JB_ERR_MEMORY;
 391    }
 392
 393    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 394    if (n != 3)
 395    {
 396       free(buf);
 397       return JB_ERR_PARSE;
 398    }
 399
 400    /* this could be a CONNECT request */
 401    if (strcmpic(v[0], "connect") == 0)
 402    {
 403       /* Secure */
 404       is_connect = 1;
 405    }
 406    /* or it could be any other basic HTTP request type */
 407    else if ((0 == strcmpic(v[0], "get"))
 408          || (0 == strcmpic(v[0], "head"))
 409          || (0 == strcmpic(v[0], "post"))
 410          || (0 == strcmpic(v[0], "put"))
 411          || (0 == strcmpic(v[0], "delete"))
 412
 413          /* or a webDAV extension (RFC2518) */
 414          || (0 == strcmpic(v[0], "propfind"))
 415          || (0 == strcmpic(v[0], "proppatch"))
 416          || (0 == strcmpic(v[0], "move"))
 417          || (0 == strcmpic(v[0], "copy"))
 418          || (0 == strcmpic(v[0], "mkcol"))
 419          || (0 == strcmpic(v[0], "lock"))
 420          || (0 == strcmpic(v[0], "unlock"))
 421          )
 422    {
 423       /* Normal */
 424       is_connect = 0;
 425    }
 426    else
 427    {
 428       /* Unknown HTTP method */
 429       free(buf);
 430       return JB_ERR_PARSE;
 431    }
 432
 433    err = parse_http_url(v[1], http, csp);
 434    if (err)
 435    {
 436       free(buf);
 437       return err;
 438    }
 439
 440    /*
 441     * Copy the details into the structure
 442     */
 443    http->ssl = is_connect;
 444    http->cmd = strdup(req);
 445    http->gpc = strdup(v[0]);
 446    http->ver = strdup(v[2]);
 447
 448    if ( (http->cmd == NULL)
 449      || (http->gpc == NULL)
 450      || (http->ver == NULL) )
 451    {
 452       free(buf);
 453       free_http_request(http);
 454       return JB_ERR_MEMORY;
 455    }
 456
 457    return JB_ERR_OK;
 458 }
 459
 460
 /*********************************************************************
  *
  * Function    :  simple_domaincmp
  *
  * Description :  Compares two equally long vectors of domain name
  *                components pairwise, from first to last, using
  *                simplematch().  Because every position must match,
  *                the comparison is anchored at both ends.
  *                This is only used by domain_match.
  *
  * Parameters  :
  *          1  :  pv = array of pattern components
  *          2  :  fv = array of domain components to compare
  *          3  :  len = number of components to compare in each array
  *
  * Returns     :  0 => every component matched (also when len <= 0),
  *                1 => at least one component did not match.
  *
  *********************************************************************/
 static int simple_domaincmp(char **pv, char **fv, int len)
 {
    int idx = 0;

    while (idx < len)
    {
       /* simplematch() reports a mismatch with a nonzero result */
       if (simplematch(pv[idx], fv[idx]) != 0)
       {
          return 1;
       }
       idx++;
    }

    return 0;
 }
 495
 496
 497 /*********************************************************************
 498  *
 499  * Function    :  domain_match
 500  *
 501  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 502  *                pattern->unachored, the comparison is un-, left-,
 503  *                right-anchored, or both.
 504  *                The individual domain names are compared with
 505  *                simplematch().
 506  *
 507  * Parameters  :
 508  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
 509  *          2  :  fqdn = domain name against which the patterns are compared.
 510  *
 511  * Returns     :  0 => domains are equivalent, else no match.
 512  *
 513  *********************************************************************/
 514 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
 515 {
 516    char **pv, **fv;  /* vectors  */
 517    int    plen, flen;
 518    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
 519
 520    plen = pattern->dcount;
 521    flen = fqdn->dcount;
 522
 523    if (flen < plen)
 524    {
 525       /* fqdn is too short to match this pattern */
 526       return 1;
 527    }
 528
 529    pv   = pattern->dvec;
 530    fv   = fqdn->dvec;
 531
 532    if (unanchored == ANCHOR_LEFT)
 533    {
 534       /*
 535        * Right anchored.
 536        *
 537        * Convert this into a fully anchored pattern with
 538        * the fqdn and pattern the same length
 539        */
 540       fv += (flen - plen); /* flen - plen >= 0 due to check above */
 541       return simple_domaincmp(pv, fv, plen);
 542    }
 543    else if (unanchored == 0)
 544    {
 545       /* Fully anchored, check length */
 546       if (flen != plen)
 547       {
 548          return 1;
 549       }
 550       return simple_domaincmp(pv, fv, plen);
 551    }
 552    else if (unanchored == ANCHOR_RIGHT)
 553    {
 554       /* Left anchored, ignore all extra in fqdn */
 555       return simple_domaincmp(pv, fv, plen);
 556    }
 557    else
 558    {
 559       /* Unanchored */
 560       int n;
 561       int maxn = flen - plen;
 562       for (n = 0; n <= maxn; n++)
 563       {
 564          if (!simple_domaincmp(pv, fv, plen))
 565          {
 566             return 0;
 567          }
 568          /*
 569           * Doesn't match from start of fqdn
 570           * Try skipping first part of fqdn
 571           */
 572          fv++;
 573       }
 574       return 1;
 575    }
 576
 577 }
 578
 579
 580 /*********************************************************************
 581  *
 582  * Function    :  create_url_spec
 583  *
 584  * Description :  Creates a "url_spec" structure from a string.
 585  *                When finished, free with unload_url().
 586  *
 587  * Parameters  :
 588  *          1  :  url = Target url_spec to be filled in.  Will be
 589  *                      zeroed before use.
 590  *          2  :  buf = Source pattern, null terminated.  NOTE: The
 591  *                      contents of this buffer are destroyed by this
 592  *                      function.  If this function succeeds, the
 593  *                      buffer is copied to url->spec.  If this
 594  *                      function fails, the contents of the buffer
 595  *                      are lost forever.
 596  *
 597  * Returns     :  JB_ERR_OK - Success
 598  *                JB_ERR_MEMORY - Out of memory
 599  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
 600  *                               written to system log)
 601  *
 602  *********************************************************************/
 603 jb_err create_url_spec(struct url_spec * url, const char * buf)
 604 {
 605    char *p;
 606
 607    assert(url);
 608    assert(buf);
 609
 610    /* Zero memory */
 611    memset(url, '\0', sizeof(*url));
 612
 613    /* save a copy of the orignal specification */
 614    if ((url->spec = strdup(buf)) == NULL)
 615    {
 616       return JB_ERR_MEMORY;
 617    }
 618
 619    if ((p = strchr(buf, '/')) != NULL)
 620    {
 621       if (NULL == (url->path = strdup(p)))
 622       {
 623          freez(url->spec);
 624          return JB_ERR_MEMORY;
 625       }
 626       url->pathlen = strlen(url->path);
 627       *p = '\0';
 628    }
 629    else
 630    {
 631       url->path    = NULL;
 632       url->pathlen = 0;
 633    }
 634    if (url->path)
 635    {
 636       int errcode;
 637       char rebuf[BUFFER_SIZE];
 638
 639       if (NULL == (url->preg = zalloc(sizeof(*url->preg))))
 640       {
 641          freez(url->spec);
 642          freez(url->path);
 643          return JB_ERR_MEMORY;
 644       }
 645
 646       sprintf(rebuf, "^(%s)", url->path);
 647
 648       errcode = regcomp(url->preg, rebuf,
 649             (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 650       if (errcode)
 651       {
 652          size_t errlen = regerror(errcode,
 653             url->preg, rebuf, sizeof(rebuf));
 654
 655          if (errlen > (sizeof(rebuf) - (size_t)1))
 656          {
 657             errlen = sizeof(rebuf) - (size_t)1;
 658          }
 659          rebuf[errlen] = '\0';
 660
 661          log_error(LOG_LEVEL_ERROR, "error compiling %s: %s",
 662             url->spec, rebuf);
 663
 664          freez(url->spec);
 665          freez(url->path);
 666          regfree(url->preg);
 667          freez(url->preg);
 668
 669          return JB_ERR_PARSE;
 670       }
 671    }
 672    if ((p = strchr(buf, ':')) == NULL)
 673    {
 674       url->port = 0;
 675    }
 676    else
 677    {
 678       *p++ = '\0';
 679       url->port = atoi(p);
 680    }
 681
 682    if (buf[0] != '\0')
 683    {
 684       char *v[150];
 685       size_t size;
 686
 687       /* Parse domain part */
 688       if (buf[strlen(buf) - 1] == '.')
 689       {
 690          url->unanchored |= ANCHOR_RIGHT;
 691       }
 692       if (buf[0] == '.')
 693       {
 694          url->unanchored |= ANCHOR_LEFT;
 695       }
 696
 697       /* split domain into components */
 698
 699       url->dbuffer = strdup(buf);
 700       if (NULL == url->dbuffer)
 701       {
 702          freez(url->spec);
 703          freez(url->path);
 704          regfree(url->preg);
 705          freez(url->preg);
 706          return JB_ERR_MEMORY;
 707       }
 708
 709       /* map to lower case */
 710       for (p = url->dbuffer; *p ; p++)
 711       {
 712          *p = tolower((int)(unsigned char)*p);
 713       }
 714
 715       /* split the domain name into components */
 716       url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 717
 718       if (url->dcount < 0)
 719       {
 720          freez(url->spec);
 721          freez(url->path);
 722          regfree(url->preg);
 723          freez(url->preg);
 724          freez(url->dbuffer);
 725          url->dcount = 0;
 726          return JB_ERR_MEMORY;
 727       }
 728       else if (url->dcount != 0)
 729       {
 730
 731          /* save a copy of the pointers in dvec */
 732          size = url->dcount * sizeof(*url->dvec);
 733
 734          url->dvec = (char **)malloc(size);
 735          if (NULL == url->dvec)
 736          {
 737             freez(url->spec);
 738             freez(url->path);
 739             regfree(url->preg);
 740             freez(url->preg);
 741             freez(url->dbuffer);
 742             url->dcount = 0;
 743             return JB_ERR_MEMORY;
 744          }
 745
 746          memcpy(url->dvec, v, size);
 747       }
 748    }
 749
 750    return JB_ERR_OK;
 751
 752 }
 753
 754
 755 /*********************************************************************
 756  *
 757  * Function    :  free_url_spec
 758  *
 759  * Description :  Called from the "unloaders".  Freez the url
 760  *                structure elements.
 761  *
 762  * Parameters  :
 763  *          1  :  url = pointer to a url_spec structure.
 764  *
 765  * Returns     :  N/A
 766  *
 767  *********************************************************************/
 768 void free_url_spec(struct url_spec *url)
 769 {
 770    if (url == NULL) return;
 771
 772    freez(url->spec);
 773    freez(url->dbuffer);
 774    freez(url->dvec);
 775    freez(url->path);
 776    if (url->preg)
 777    {
 778       regfree(url->preg);
 779       freez(url->preg);
 780    }
 781 }
 782
 783
 784 /*********************************************************************
 785  *
 786  * Function    :  url_match
 787  *
 788  * Description :  Compare a URL against a URL pattern.
 789  *
 790  * Parameters  :
 791  *          1  :  pattern = a URL pattern
 792  *          2  :  url = URL to match
 793  *
 794  * Returns     :  0 iff the URL matches the pattern, else nonzero.
 795  *
 796  *********************************************************************/
 797 int url_match(const struct url_spec *pattern,
 798               const struct http_request *url)
 799 {
 800    return ((pattern->port == 0) || (pattern->port == url->port))
 801        && ((pattern->dbuffer == NULL) || (domain_match(pattern, url) == 0))
 802        && ((pattern->path == NULL) ||
 803             (regexec(pattern->preg, url->path, 0, NULL, 0) == 0)
 804       );
 805 }
 806
 807
 808 /*
 809   Local Variables:
 810   tab-width: 3
 811   end:
 812 */