urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.3 2002/03/03 14:51:11 oes Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001 the SourceForge
  10  *                IJBSWA team.  http://ijbswa.sourceforge.net
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  * Revisions   :
  35  *    $Log: urlmatch.c,v $
  36  *    Revision 1.3  2002/03/03 14:51:11  oes
  37  *    Fixed CLF logging: Added ocmd member for client's request to struct http_request
  38  *
  39  *    Revision 1.2  2002/01/21 00:14:09  jongfoster
  40  *    Correcting comment style
  41  *    Fixing an uninitialized memory bug in create_url_spec()
  42  *
  43  *    Revision 1.1  2002/01/17 20:53:46  jongfoster
  44  *    Moving all our URL and URL pattern parsing code to the same file - it
  45  *    was scattered around in filters.c, loaders.c and parsers.c.
  46  *
  47  *    Providing a single, simple url_match(pattern,url) function - rather than
  48  *    the 3-line match routine which was repeated all over the place.
  49  *
  50  *    Renaming free_url to free_url_spec, since it frees a struct url_spec.
  51  *
  52  *    Providing parse_http_url() so that URLs can be parsed without faking a
  53  *    HTTP request line for parse_http_request() or repeating the parsing
  54  *    code (both of which were techniques that were actually in use).
  55  *
  56  *    Standardizing that struct http_request is used to represent a URL, and
  57  *    struct url_spec is used to represent a URL pattern.  (Before, URLs were
  58  *    represented as seperate variables and a partially-filled-in url_spec).
  59  *
  60  *
  61  *********************************************************************/
  62 \f
  63
  64 #include "config.h"
  65
  66 #ifndef _WIN32
  67 #include <stdio.h>
  68 #include <sys/types.h>
  69 #endif
  70
  71 #include <stdlib.h>
  72 #include <ctype.h>
  73 #include <assert.h>
  74 #include <string.h>
  75
  76 #if !defined(_WIN32) && !defined(__OS2__)
  77 #include <unistd.h>
  78 #endif
  79
  80 #include "project.h"
  81 #include "urlmatch.h"
  82 #include "ssplit.h"
  83 #include "miscutil.h"
  84 #include "errlog.h"
  85
  86 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  87
  88 /* Fix a problem with Solaris.  There should be no effect on other
  89  * platforms.
  90  * Solaris's isspace() is a macro which uses it's argument directly
  91  * as an array index.  Therefore we need to make sure that high-bit
  92  * characters generate +ve values, and ideally we also want to make
  93  * the argument match the declared parameter type of "int".
  94  *
  95  * Why did they write a character function that can't take a simple
  96  * "char" argument?  Doh!
  97  */
  98 #define ijb_isupper(__X) isupper((int)(unsigned char)(__X))
  99 #define ijb_tolower(__X) tolower((int)(unsigned char)(__X))
 100
 101
 102 /*********************************************************************
 103  *
 104  * Function    :  free_http_request
 105  *
 106  * Description :  Freez a http_request structure
 107  *
 108  * Parameters  :
 109  *          1  :  http = points to a http_request structure to free
 110  *
 111  * Returns     :  N/A
 112  *
 113  *********************************************************************/
 114 void free_http_request(struct http_request *http)
 115 {
 116    assert(http);
 117
 118    freez(http->cmd);
 119    freez(http->ocmd);
 120    freez(http->gpc);
 121    freez(http->host);
 122    freez(http->url);
 123    freez(http->hostport);
 124    freez(http->path);
 125    freez(http->ver);
 126    freez(http->host_ip_addr_str);
 127    freez(http->dbuffer);
 128    freez(http->dvec);
 129    http->dcount = 0;
 130 }
 131
 132
 133 /*********************************************************************
 134  *
 135  * Function    :  parse_http_url
 136  *
 137  * Description :  Parse out the host and port from the URL.  Find the
 138  *                hostname & path, port (if ':'), and/or password (if '@')
 139  *
 140  * Parameters  :
 141  *          1  :  url = URL (or is it URI?) to break down
 142  *          2  :  http = pointer to the http structure to hold elements.
 143  *                       Will be zeroed before use.  Note that this
 144  *                       function sets the http->gpc and http->ver
 145  *                       members to NULL.
 146  *          3  :  csp = Current client state (buffers, headers, etc...)
 147  *
 148  * Returns     :  JB_ERR_OK on success
 149  *                JB_ERR_MEMORY on out of memory
 150  *                JB_ERR_CGI_PARAMS on malformed command/URL
 151  *                                  or >100 domains deep.
 152  *
 153  *********************************************************************/
 154 jb_err parse_http_url(const char * url,
 155                       struct http_request *http,
 156                       struct client_state *csp)
 157 {
 158    /*
 159     * Zero out the results structure
 160     */
 161    memset(http, '\0', sizeof(*http));
 162
 163
 164    /*
 165     * Save our initial URL
 166     */
 167    http->url = strdup(url);
 168    if (http->url == NULL)
 169    {
 170       return JB_ERR_MEMORY;
 171    }
 172
 173
 174    /*
 175     * Split URL into protocol,hostport,path.
 176     */
 177    {
 178       char *buf;
 179       char *url_noproto;
 180       char *url_path;
 181
 182       buf = strdup(url);
 183       if (buf == NULL)
 184       {
 185          return JB_ERR_MEMORY;
 186       }
 187
 188       /* Find the start of the URL in our scratch space */
 189       url_noproto = buf;
 190       if (strncmpic(url_noproto, "http://",  7) == 0)
 191       {
 192          url_noproto += 7;
 193          http->ssl = 0;
 194       }
 195       else if (strncmpic(url_noproto, "https://", 8) == 0)
 196       {
 197          url_noproto += 8;
 198          http->ssl = 1;
 199       }
 200       else
 201       {
 202          http->ssl = 0;
 203       }
 204
 205       url_path = strchr(url_noproto, '/');
 206       if (url_path != NULL)
 207       {
 208          /*
 209           * Got a path.
 210           *
 211           * NOTE: The following line ignores the path for HTTPS URLS.
 212           * This means that you get consistent behaviour if you type a
 213           * https URL in and it's parsed by the function.  (When the
 214           * URL is actually retrieved, SSL hides the path part).
 215           */
 216          http->path = strdup(http->ssl ? "/" : url_path);
 217          *url_path = '\0';
 218          http->hostport = strdup(url_noproto);
 219       }
 220       else
 221       {
 222          /*
 223           * Repair broken HTTP requests that don't contain a path,
 224           * or CONNECT requests
 225           */
 226          http->path = strdup("/");
 227          http->hostport = strdup(url_noproto);
 228       }
 229
 230       free(buf);
 231
 232       if ( (http->path == NULL)
 233         || (http->hostport == NULL))
 234       {
 235          free(buf);
 236          free_http_request(http);
 237          return JB_ERR_MEMORY;
 238       }
 239    }
 240
 241
 242    /*
 243     * Split hostport into user/password (ignored), host, port.
 244     */
 245    {
 246       char *buf;
 247       char *host;
 248       char *port;
 249
 250       buf = strdup(http->hostport);
 251       if (buf == NULL)
 252       {
 253          free_http_request(http);
 254          return JB_ERR_MEMORY;
 255       }
 256
 257       /* check if url contains username and/or password */
 258       host = strchr(buf, '@');
 259       if (host != NULL)
 260       {
 261          /* Contains username/password, skip it and the @ sign. */
 262          host++;
 263       }
 264       else
 265       {
 266          /* No username or password. */
 267          host = buf;
 268       }
 269
 270       /* check if url contains port */
 271       port = strchr(host, ':');
 272       if (port != NULL)
 273       {
 274          /* Contains port */
 275          /* Terminate hostname and point to start of port string */
 276          *port++ = '\0';
 277          http->port = atoi(port);
 278       }
 279       else
 280       {
 281          /* No port specified. */
 282          http->port = (http->ssl ? 143 : 80);
 283       }
 284
 285       http->host = strdup(host);
 286
 287       free(buf);
 288
 289       if (http->host == NULL)
 290       {
 291          free_http_request(http);
 292          return JB_ERR_MEMORY;
 293       }
 294    }
 295
 296
 297    /*
 298     * Split domain name so we can compare it against wildcards
 299     */
 300    {
 301       char *vec[BUFFER_SIZE];
 302       size_t size;
 303       char *p;
 304
 305       http->dbuffer = strdup(http->host);
 306       if (NULL == http->dbuffer)
 307       {
 308          free_http_request(http);
 309          return JB_ERR_MEMORY;
 310       }
 311
 312       /* map to lower case */
 313       for (p = http->dbuffer; *p ; p++)
 314       {
 315          *p = tolower((int)(unsigned char)*p);
 316       }
 317
 318       /* split the domain name into components */
 319       http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 320
 321       if (http->dcount <= 0)
 322       {
 323          /*
 324           * Error: More than SZ(vec) components in domain
 325           *    or: no components in domain
 326           */
 327          free_http_request(http);
 328          return JB_ERR_PARSE;
 329       }
 330
 331       /* save a copy of the pointers in dvec */
 332       size = http->dcount * sizeof(*http->dvec);
 333
 334       http->dvec = (char **)malloc(size);
 335       if (NULL == http->dvec)
 336       {
 337          free_http_request(http);
 338          return JB_ERR_MEMORY;
 339       }
 340
 341       memcpy(http->dvec, vec, size);
 342    }
 343
 344
 345    return JB_ERR_OK;
 346 }
 347
 348
 349 /*********************************************************************
 350  *
 351  * Function    :  parse_http_request
 352  *
 353  * Description :  Parse out the host and port from the URL.  Find the
 354  *                hostname & path, port (if ':'), and/or password (if '@')
 355  *
 356  * Parameters  :
 357  *          1  :  req = HTTP request line to break down
 358  *          2  :  http = pointer to the http structure to hold elements
 359  *          3  :  csp = Current client state (buffers, headers, etc...)
 360  *
 361  * Returns     :  JB_ERR_OK on success
 362  *                JB_ERR_MEMORY on out of memory
 363  *                JB_ERR_CGI_PARAMS on malformed command/URL
 364  *                                  or >100 domains deep.
 365  *
 366  *********************************************************************/
 367 jb_err parse_http_request(const char *req,
 368                           struct http_request *http,
 369                           struct client_state *csp)
 370 {
 371    char *buf;
 372    char *v[10];
 373    int n;
 374    jb_err err;
 375    int is_connect = 0;
 376
 377    memset(http, '\0', sizeof(*http));
 378
 379    buf = strdup(req);
 380    if (buf == NULL)
 381    {
 382       return JB_ERR_MEMORY;
 383    }
 384
 385    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 386    if (n != 3)
 387    {
 388       free(buf);
 389       return JB_ERR_PARSE;
 390    }
 391
 392    /* this could be a CONNECT request */
 393    if (strcmpic(v[0], "connect") == 0)
 394    {
 395       /* Secure */
 396       is_connect = 1;
 397    }
 398    /* or it could be any other basic HTTP request type */
 399    else if ((0 == strcmpic(v[0], "get"))
 400          || (0 == strcmpic(v[0], "head"))
 401          || (0 == strcmpic(v[0], "post"))
 402          || (0 == strcmpic(v[0], "put"))
 403          || (0 == strcmpic(v[0], "delete"))
 404
 405          /* or a webDAV extension (RFC2518) */
 406          || (0 == strcmpic(v[0], "propfind"))
 407          || (0 == strcmpic(v[0], "proppatch"))
 408          || (0 == strcmpic(v[0], "move"))
 409          || (0 == strcmpic(v[0], "copy"))
 410          || (0 == strcmpic(v[0], "mkcol"))
 411          || (0 == strcmpic(v[0], "lock"))
 412          || (0 == strcmpic(v[0], "unlock"))
 413          )
 414    {
 415       /* Normal */
 416       is_connect = 0;
 417    }
 418    else
 419    {
 420       /* Unknown HTTP method */
 421       free(buf);
 422       return JB_ERR_PARSE;
 423    }
 424
 425    err = parse_http_url(v[1], http, csp);
 426    if (err)
 427    {
 428       free(buf);
 429       return err;
 430    }
 431
 432    /*
 433     * Copy the details into the structure
 434     */
 435    http->ssl = is_connect;
 436    http->cmd = strdup(req);
 437    http->gpc = strdup(v[0]);
 438    http->ver = strdup(v[2]);
 439
 440    if ( (http->cmd == NULL)
 441      || (http->gpc == NULL)
 442      || (http->ver == NULL) )
 443    {
 444       free(buf);
 445       free_http_request(http);
 446       return JB_ERR_MEMORY;
 447    }
 448
 449    return JB_ERR_OK;
 450 }
 451
 452
 453 /*********************************************************************
 454  *
 455  * Function    :  simple_domaincmp
 456  *
 457  * Description :  Domain-wise Compare fqdn's.  The comparison is
 458  *                both left- and right-anchored.  The individual
 459  *                domain names are compared with simplematch().
 460  *                This is only used by domain_match.
 461  *
 462  * Parameters  :
 463  *          1  :  pv = array of patterns to compare
 464  *          2  :  fv = array of domain components to compare
 465  *          3  :  len = length of the arrays (both arrays are the
 466  *                      same length - if they weren't, it couldn't
 467  *                      possibly be a match).
 468  *
 469  * Returns     :  0 => domains are equivalent, else no match.
 470  *
 471  *********************************************************************/
 472 static int simple_domaincmp(char **pv, char **fv, int len)
 473 {
 474    int n;
 475
 476    for (n = 0; n < len; n++)
 477    {
 478       if (simplematch(pv[n], fv[n]))
 479       {
 480          return 1;
 481       }
 482    }
 483
 484    return 0;
 485
 486 }
 487
 488
 489 /*********************************************************************
 490  *
 491  * Function    :  domain_match
 492  *
 493  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 494  *                pattern->unachored, the comparison is un-, left-,
 495  *                right-anchored, or both.
 496  *                The individual domain names are compared with
 497  *                simplematch().
 498  *
 499  * Parameters  :
 500  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
 501  *          2  :  fqdn = domain name against which the patterns are compared.
 502  *
 503  * Returns     :  0 => domains are equivalent, else no match.
 504  *
 505  *********************************************************************/
 506 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
 507 {
 508    char **pv, **fv;  /* vectors  */
 509    int    plen, flen;
 510    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
 511
 512    plen = pattern->dcount;
 513    flen = fqdn->dcount;
 514
 515    if (flen < plen)
 516    {
 517       /* fqdn is too short to match this pattern */
 518       return 1;
 519    }
 520
 521    pv   = pattern->dvec;
 522    fv   = fqdn->dvec;
 523
 524    if (unanchored == ANCHOR_LEFT)
 525    {
 526       /*
 527        * Right anchored.
 528        *
 529        * Convert this into a fully anchored pattern with
 530        * the fqdn and pattern the same length
 531        */
 532       fv += (flen - plen); /* flen - plen >= 0 due to check above */
 533       return simple_domaincmp(pv, fv, plen);
 534    }
 535    else if (unanchored == 0)
 536    {
 537       /* Fully anchored, check length */
 538       if (flen != plen)
 539       {
 540          return 1;
 541       }
 542       return simple_domaincmp(pv, fv, plen);
 543    }
 544    else if (unanchored == ANCHOR_RIGHT)
 545    {
 546       /* Left anchored, ignore all extra in fqdn */
 547       return simple_domaincmp(pv, fv, plen);
 548    }
 549    else
 550    {
 551       /* Unanchored */
 552       int n;
 553       int maxn = flen - plen;
 554       for (n = 0; n <= maxn; n++)
 555       {
 556          if (!simple_domaincmp(pv, fv, plen))
 557          {
 558             return 0;
 559          }
 560          /*
 561           * Doesn't match from start of fqdn
 562           * Try skipping first part of fqdn
 563           */
 564          fv++;
 565       }
 566       return 1;
 567    }
 568
 569 }
 570
 571
 572 /*********************************************************************
 573  *
 574  * Function    :  create_url_spec
 575  *
 576  * Description :  Creates a "url_spec" structure from a string.
 577  *                When finished, free with unload_url().
 578  *
 579  * Parameters  :
 580  *          1  :  url = Target url_spec to be filled in.  Will be
 581  *                      zeroed before use.
 582  *          2  :  buf = Source pattern, null terminated.  NOTE: The
 583  *                      contents of this buffer are destroyed by this
 584  *                      function.  If this function succeeds, the
 585  *                      buffer is copied to url->spec.  If this
 586  *                      function fails, the contents of the buffer
 587  *                      are lost forever.
 588  *
 589  * Returns     :  JB_ERR_OK - Success
 590  *                JB_ERR_MEMORY - Out of memory
 591  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
 592  *                               written to system log)
 593  *
 594  *********************************************************************/
 595 jb_err create_url_spec(struct url_spec * url, const char * buf)
 596 {
 597    char *p;
 598
 599    assert(url);
 600    assert(buf);
 601
 602    /* Zero memory */
 603    memset(url, '\0', sizeof(*url));
 604
 605    /* save a copy of the orignal specification */
 606    if ((url->spec = strdup(buf)) == NULL)
 607    {
 608       return JB_ERR_MEMORY;
 609    }
 610
 611    if ((p = strchr(buf, '/')))
 612    {
 613       if (NULL == (url->path = strdup(p)))
 614       {
 615          freez(url->spec);
 616          return JB_ERR_MEMORY;
 617       }
 618       url->pathlen = strlen(url->path);
 619       *p = '\0';
 620    }
 621    else
 622    {
 623       url->path    = NULL;
 624       url->pathlen = 0;
 625    }
 626 #ifdef REGEX
 627    if (url->path)
 628    {
 629       int errcode;
 630       char rebuf[BUFFER_SIZE];
 631
 632       if (NULL == (url->preg = zalloc(sizeof(*url->preg))))
 633       {
 634          freez(url->spec);
 635          freez(url->path);
 636          return JB_ERR_MEMORY;
 637       }
 638
 639       sprintf(rebuf, "^(%s)", url->path);
 640
 641       errcode = regcomp(url->preg, rebuf,
 642             (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 643       if (errcode)
 644       {
 645          size_t errlen = regerror(errcode,
 646             url->preg, rebuf, sizeof(rebuf));
 647
 648          if (errlen > (sizeof(rebuf) - (size_t)1))
 649          {
 650             errlen = sizeof(rebuf) - (size_t)1;
 651          }
 652          rebuf[errlen] = '\0';
 653
 654          log_error(LOG_LEVEL_ERROR, "error compiling %s: %s",
 655             url->spec, rebuf);
 656
 657          freez(url->spec);
 658          freez(url->path);
 659          freez(url->preg);
 660
 661          return JB_ERR_PARSE;
 662       }
 663    }
 664 #endif
 665    if ((p = strchr(buf, ':')) == NULL)
 666    {
 667       url->port = 0;
 668    }
 669    else
 670    {
 671       *p++ = '\0';
 672       url->port = atoi(p);
 673    }
 674
 675    if (buf[0] != '\0')
 676    {
 677       char *v[150];
 678       size_t size;
 679
 680       /* Parse domain part */
 681       if (buf[strlen(buf) - 1] == '.')
 682       {
 683          url->unanchored |= ANCHOR_RIGHT;
 684       }
 685       if (buf[0] == '.')
 686       {
 687          url->unanchored |= ANCHOR_LEFT;
 688       }
 689
 690       /* split domain into components */
 691
 692       url->dbuffer = strdup(buf);
 693       if (NULL == url->dbuffer)
 694       {
 695          freez(url->spec);
 696          freez(url->path);
 697 #ifdef REGEX
 698          freez(url->preg);
 699 #endif /* def REGEX */
 700          return JB_ERR_MEMORY;
 701       }
 702
 703       /* map to lower case */
 704       for (p = url->dbuffer; *p ; p++)
 705       {
 706          *p = tolower((int)(unsigned char)*p);
 707       }
 708
 709       /* split the domain name into components */
 710       url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 711
 712       if (url->dcount < 0)
 713       {
 714          freez(url->spec);
 715          freez(url->path);
 716 #ifdef REGEX
 717          freez(url->preg);
 718 #endif /* def REGEX */
 719          freez(url->dbuffer);
 720          url->dcount = 0;
 721          return JB_ERR_MEMORY;
 722       }
 723       else if (url->dcount != 0)
 724       {
 725
 726          /* save a copy of the pointers in dvec */
 727          size = url->dcount * sizeof(*url->dvec);
 728
 729          url->dvec = (char **)malloc(size);
 730          if (NULL == url->dvec)
 731          {
 732             freez(url->spec);
 733             freez(url->path);
 734 #ifdef REGEX
 735             freez(url->preg);
 736 #endif /* def REGEX */
 737             freez(url->dbuffer);
 738             url->dcount = 0;
 739             return JB_ERR_MEMORY;
 740          }
 741
 742          memcpy(url->dvec, v, size);
 743       }
 744    }
 745
 746    return JB_ERR_OK;
 747
 748 }
 749
 750
 751 /*********************************************************************
 752  *
 753  * Function    :  free_url_spec
 754  *
 755  * Description :  Called from the "unloaders".  Freez the url
 756  *                structure elements.
 757  *
 758  * Parameters  :
 759  *          1  :  url = pointer to a url_spec structure.
 760  *
 761  * Returns     :  N/A
 762  *
 763  *********************************************************************/
 764 void free_url_spec(struct url_spec *url)
 765 {
 766    if (url == NULL) return;
 767
 768    freez(url->spec);
 769    freez(url->dbuffer);
 770    freez(url->dvec);
 771    freez(url->path);
 772 #ifdef REGEX
 773    if (url->preg)
 774    {
 775       regfree(url->preg);
 776       freez(url->preg);
 777    }
 778 #endif
 779
 780 }
 781
 782
 783 /*********************************************************************
 784  *
 785  * Function    :  url_match
 786  *
 787  * Description :  Compare a URL against a URL pattern.
 788  *
 789  * Parameters  :
 790  *          1  :  pattern = a URL pattern
 791  *          2  :  url = URL to match
 792  *
 793  * Returns     :  0 iff the URL matches the pattern, else nonzero.
 794  *
 795  *********************************************************************/
 796 int url_match(const struct url_spec *pattern,
 797               const struct http_request *url)
 798 {
 799    return ((pattern->port == 0) || (pattern->port == url->port))
 800        && ((pattern->dbuffer == NULL) || (domain_match(pattern, url) == 0))
 801        && ((pattern->path == NULL) ||
 802 #ifdef REGEX
 803             (regexec(pattern->preg, url->path, 0, NULL, 0) == 0)
 804 #else
 805             (strncmp(pattern->path, url->path, pattern->pathlen) == 0)
 806 #endif
 807       );
 808 }
 809
 810
 811 /*
 812   Local Variables:
 813   tab-width: 3
 814   end:
 815 */