urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.2 2002/01/21 00:14:09 jongfoster Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001 the SourceForge
  10  *                IJBSWA team.  http://ijbswa.sourceforge.net
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  * Revisions   :
  35  *    $Log: urlmatch.c,v $
  36  *    Revision 1.2  2002/01/21 00:14:09  jongfoster
  37  *    Correcting comment style
  38  *    Fixing an uninitialized memory bug in create_url_spec()
  39  *
  40  *    Revision 1.1  2002/01/17 20:53:46  jongfoster
  41  *    Moving all our URL and URL pattern parsing code to the same file - it
  42  *    was scattered around in filters.c, loaders.c and parsers.c.
  43  *
  44  *    Providing a single, simple url_match(pattern,url) function - rather than
  45  *    the 3-line match routine which was repeated all over the place.
  46  *
  47  *    Renaming free_url to free_url_spec, since it frees a struct url_spec.
  48  *
  49  *    Providing parse_http_url() so that URLs can be parsed without faking a
  50  *    HTTP request line for parse_http_request() or repeating the parsing
  51  *    code (both of which were techniques that were actually in use).
  52  *
  53  *    Standardizing that struct http_request is used to represent a URL, and
  54  *    struct url_spec is used to represent a URL pattern.  (Before, URLs were
  55  *    represented as separate variables and a partially-filled-in url_spec).
  56  *
  57  *
  58  *********************************************************************/
  59 \f
  60
  61 #include "config.h"
  62
  63 #ifndef _WIN32
  64 #include <stdio.h>
  65 #include <sys/types.h>
  66 #endif
  67
  68 #include <stdlib.h>
  69 #include <ctype.h>
  70 #include <assert.h>
  71 #include <string.h>
  72
  73 #if !defined(_WIN32) && !defined(__OS2__)
  74 #include <unistd.h>
  75 #endif
  76
  77 #include "project.h"
  78 #include "urlmatch.h"
  79 #include "ssplit.h"
  80 #include "miscutil.h"
  81 #include "errlog.h"
  82
  83 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  84
  85 /* Fix a problem with Solaris.  There should be no effect on other
  86  * platforms.
  87  * Solaris's isspace() is a macro which uses it's argument directly
  88  * as an array index.  Therefore we need to make sure that high-bit
  89  * characters generate +ve values, and ideally we also want to make
  90  * the argument match the declared parameter type of "int".
  91  *
  92  * Why did they write a character function that can't take a simple
  93  * "char" argument?  Doh!
  94  */
  95 #define ijb_isupper(__X) isupper((int)(unsigned char)(__X))
  96 #define ijb_tolower(__X) tolower((int)(unsigned char)(__X))
  97
  98
  99 /*********************************************************************
 100  *
 101  * Function    :  free_http_request
 102  *
 103  * Description :  Freez a http_request structure
 104  *
 105  * Parameters  :
 106  *          1  :  http = points to a http_request structure to free
 107  *
 108  * Returns     :  N/A
 109  *
 110  *********************************************************************/
 111 void free_http_request(struct http_request *http)
 112 {
 113    assert(http);
 114
 115    freez(http->cmd);
 116    freez(http->ocmd);
 117    freez(http->gpc);
 118    freez(http->host);
 119    freez(http->url);
 120    freez(http->hostport);
 121    freez(http->path);
 122    freez(http->ver);
 123    freez(http->host_ip_addr_str);
 124    freez(http->dbuffer);
 125    freez(http->dvec);
 126    http->dcount = 0;
 127 }
 128
 129
 130 /*********************************************************************
 131  *
 132  * Function    :  parse_http_url
 133  *
 134  * Description :  Parse out the host and port from the URL.  Find the
 135  *                hostname & path, port (if ':'), and/or password (if '@')
 136  *
 137  * Parameters  :
 138  *          1  :  url = URL (or is it URI?) to break down
 139  *          2  :  http = pointer to the http structure to hold elements.
 140  *                       Will be zeroed before use.  Note that this
 141  *                       function sets the http->gpc and http->ver
 142  *                       members to NULL.
 143  *          3  :  csp = Current client state (buffers, headers, etc...)
 144  *
 145  * Returns     :  JB_ERR_OK on success
 146  *                JB_ERR_MEMORY on out of memory
 147  *                JB_ERR_CGI_PARAMS on malformed command/URL
 148  *                                  or >100 domains deep.
 149  *
 150  *********************************************************************/
 151 jb_err parse_http_url(const char * url,
 152                       struct http_request *http,
 153                       struct client_state *csp)
 154 {
 155    /*
 156     * Zero out the results structure
 157     */
 158    memset(http, '\0', sizeof(*http));
 159
 160
 161    /*
 162     * Save our initial URL
 163     */
 164    http->url = strdup(url);
 165    if (http->url == NULL)
 166    {
 167       return JB_ERR_MEMORY;
 168    }
 169
 170
 171    /*
 172     * Split URL into protocol,hostport,path.
 173     */
 174    {
 175       char *buf;
 176       char *url_noproto;
 177       char *url_path;
 178
 179       buf = strdup(url);
 180       if (buf == NULL)
 181       {
 182          return JB_ERR_MEMORY;
 183       }
 184
 185       /* Find the start of the URL in our scratch space */
 186       url_noproto = buf;
 187       if (strncmpic(url_noproto, "http://",  7) == 0)
 188       {
 189          url_noproto += 7;
 190          http->ssl = 0;
 191       }
 192       else if (strncmpic(url_noproto, "https://", 8) == 0)
 193       {
 194          url_noproto += 8;
 195          http->ssl = 1;
 196       }
 197       else
 198       {
 199          http->ssl = 0;
 200       }
 201
 202       url_path = strchr(url_noproto, '/');
 203       if (url_path != NULL)
 204       {
 205          /*
 206           * Got a path.
 207           *
 208           * NOTE: The following line ignores the path for HTTPS URLS.
 209           * This means that you get consistent behaviour if you type a
 210           * https URL in and it's parsed by the function.  (When the
 211           * URL is actually retrieved, SSL hides the path part).
 212           */
 213          http->path = strdup(http->ssl ? "/" : url_path);
 214          *url_path = '\0';
 215          http->hostport = strdup(url_noproto);
 216       }
 217       else
 218       {
 219          /*
 220           * Repair broken HTTP requests that don't contain a path,
 221           * or CONNECT requests
 222           */
 223          http->path = strdup("/");
 224          http->hostport = strdup(url_noproto);
 225       }
 226
 227       free(buf);
 228
 229       if ( (http->path == NULL)
 230         || (http->hostport == NULL))
 231       {
 232          free(buf);
 233          free_http_request(http);
 234          return JB_ERR_MEMORY;
 235       }
 236    }
 237
 238
 239    /*
 240     * Split hostport into user/password (ignored), host, port.
 241     */
 242    {
 243       char *buf;
 244       char *host;
 245       char *port;
 246
 247       buf = strdup(http->hostport);
 248       if (buf == NULL)
 249       {
 250          free_http_request(http);
 251          return JB_ERR_MEMORY;
 252       }
 253
 254       /* check if url contains username and/or password */
 255       host = strchr(buf, '@');
 256       if (host != NULL)
 257       {
 258          /* Contains username/password, skip it and the @ sign. */
 259          host++;
 260       }
 261       else
 262       {
 263          /* No username or password. */
 264          host = buf;
 265       }
 266
 267       /* check if url contains port */
 268       port = strchr(host, ':');
 269       if (port != NULL)
 270       {
 271          /* Contains port */
 272          /* Terminate hostname and point to start of port string */
 273          *port++ = '\0';
 274          http->port = atoi(port);
 275       }
 276       else
 277       {
 278          /* No port specified. */
 279          http->port = (http->ssl ? 143 : 80);
 280       }
 281
 282       http->host = strdup(host);
 283
 284       free(buf);
 285
 286       if (http->host == NULL)
 287       {
 288          free_http_request(http);
 289          return JB_ERR_MEMORY;
 290       }
 291    }
 292
 293
 294    /*
 295     * Split domain name so we can compare it against wildcards
 296     */
 297    {
 298       char *vec[BUFFER_SIZE];
 299       int size;
 300       char *p;
 301
 302       http->dbuffer = strdup(http->host);
 303       if (NULL == http->dbuffer)
 304       {
 305          free_http_request(http);
 306          return JB_ERR_MEMORY;
 307       }
 308
 309       /* map to lower case */
 310       for (p = http->dbuffer; *p ; p++)
 311       {
 312          *p = tolower((int)(unsigned char)*p);
 313       }
 314
 315       /* split the domain name into components */
 316       http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 317
 318       if (http->dcount <= 0)
 319       {
 320          /*
 321           * Error: More than SZ(vec) components in domain
 322           *    or: no components in domain
 323           */
 324          free_http_request(http);
 325          return JB_ERR_PARSE;
 326       }
 327
 328       /* save a copy of the pointers in dvec */
 329       size = http->dcount * sizeof(*http->dvec);
 330
 331       http->dvec = (char **)malloc(size);
 332       if (NULL == http->dvec)
 333       {
 334          free_http_request(http);
 335          return JB_ERR_MEMORY;
 336       }
 337
 338       memcpy(http->dvec, vec, size);
 339    }
 340
 341
 342    return JB_ERR_OK;
 343 }
 344
 345
 346 /*********************************************************************
 347  *
 348  * Function    :  parse_http_request
 349  *
 350  * Description :  Parse out the host and port from the URL.  Find the
 351  *                hostname & path, port (if ':'), and/or password (if '@')
 352  *
 353  * Parameters  :
 354  *          1  :  req = HTTP request line to break down
 355  *          2  :  http = pointer to the http structure to hold elements
 356  *          3  :  csp = Current client state (buffers, headers, etc...)
 357  *
 358  * Returns     :  JB_ERR_OK on success
 359  *                JB_ERR_MEMORY on out of memory
 360  *                JB_ERR_CGI_PARAMS on malformed command/URL
 361  *                                  or >100 domains deep.
 362  *
 363  *********************************************************************/
 364 jb_err parse_http_request(const char *req,
 365                           struct http_request *http,
 366                           struct client_state *csp)
 367 {
 368    char *buf;
 369    char *v[10];
 370    int n;
 371    jb_err err;
 372    int is_connect = 0;
 373
 374    memset(http, '\0', sizeof(*http));
 375
 376    buf = strdup(req);
 377    if (buf == NULL)
 378    {
 379       return JB_ERR_MEMORY;
 380    }
 381
 382    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 383    if (n != 3)
 384    {
 385       free(buf);
 386       return JB_ERR_PARSE;
 387    }
 388
 389    /* this could be a CONNECT request */
 390    if (strcmpic(v[0], "connect") == 0)
 391    {
 392       /* Secure */
 393       is_connect = 1;
 394    }
 395    /* or it could be any other basic HTTP request type */
 396    else if ((0 == strcmpic(v[0], "get"))
 397          || (0 == strcmpic(v[0], "head"))
 398          || (0 == strcmpic(v[0], "post"))
 399          || (0 == strcmpic(v[0], "put"))
 400          || (0 == strcmpic(v[0], "delete"))
 401
 402          /* or a webDAV extension (RFC2518) */
 403          || (0 == strcmpic(v[0], "propfind"))
 404          || (0 == strcmpic(v[0], "proppatch"))
 405          || (0 == strcmpic(v[0], "move"))
 406          || (0 == strcmpic(v[0], "copy"))
 407          || (0 == strcmpic(v[0], "mkcol"))
 408          || (0 == strcmpic(v[0], "lock"))
 409          || (0 == strcmpic(v[0], "unlock"))
 410          )
 411    {
 412       /* Normal */
 413       is_connect = 0;
 414    }
 415    else
 416    {
 417       /* Unknown HTTP method */
 418       free(buf);
 419       return JB_ERR_PARSE;
 420    }
 421
 422    err = parse_http_url(v[1], http, csp);
 423    if (err)
 424    {
 425       free(buf);
 426       return err;
 427    }
 428
 429    /*
 430     * Copy the details into the structure
 431     */
 432    http->ssl = is_connect;
 433    http->cmd = strdup(req);
 434    http->gpc = strdup(v[0]);
 435    http->ver = strdup(v[2]);
 436
 437    if ( (http->cmd == NULL)
 438      || (http->gpc == NULL)
 439      || (http->ver == NULL) )
 440    {
 441       free(buf);
 442       free_http_request(http);
 443       return JB_ERR_MEMORY;
 444    }
 445
 446    return JB_ERR_OK;
 447 }
 448
 449
 450 /*********************************************************************
 451  *
 452  * Function    :  simple_domaincmp
 453  *
 454  * Description :  Domain-wise Compare fqdn's.  The comparison is
 455  *                both left- and right-anchored.  The individual
 456  *                domain names are compared with simplematch().
 457  *                This is only used by domain_match.
 458  *
 459  * Parameters  :
 460  *          1  :  pv = array of patterns to compare
 461  *          2  :  fv = array of domain components to compare
 462  *          3  :  len = length of the arrays (both arrays are the
 463  *                      same length - if they weren't, it couldn't
 464  *                      possibly be a match).
 465  *
 466  * Returns     :  0 => domains are equivalent, else no match.
 467  *
 468  *********************************************************************/
 469 static int simple_domaincmp(char **pv, char **fv, int len)
 470 {
 471    int n;
 472
 473    for (n = 0; n < len; n++)
 474    {
 475       if (simplematch(pv[n], fv[n]))
 476       {
 477          return 1;
 478       }
 479    }
 480
 481    return 0;
 482
 483 }
 484
 485
 486 /*********************************************************************
 487  *
 488  * Function    :  domain_match
 489  *
 490  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 491  *                pattern->unachored, the comparison is un-, left-,
 492  *                right-anchored, or both.
 493  *                The individual domain names are compared with
 494  *                simplematch().
 495  *
 496  * Parameters  :
 497  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
 498  *          2  :  fqdn = domain name against which the patterns are compared.
 499  *
 500  * Returns     :  0 => domains are equivalent, else no match.
 501  *
 502  *********************************************************************/
 503 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
 504 {
 505    char **pv, **fv;  /* vectors  */
 506    int    plen, flen;
 507    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
 508
 509    plen = pattern->dcount;
 510    flen = fqdn->dcount;
 511
 512    if (flen < plen)
 513    {
 514       /* fqdn is too short to match this pattern */
 515       return 1;
 516    }
 517
 518    pv   = pattern->dvec;
 519    fv   = fqdn->dvec;
 520
 521    if (unanchored == ANCHOR_LEFT)
 522    {
 523       /*
 524        * Right anchored.
 525        *
 526        * Convert this into a fully anchored pattern with
 527        * the fqdn and pattern the same length
 528        */
 529       fv += (flen - plen); /* flen - plen >= 0 due to check above */
 530       return simple_domaincmp(pv, fv, plen);
 531    }
 532    else if (unanchored == 0)
 533    {
 534       /* Fully anchored, check length */
 535       if (flen != plen)
 536       {
 537          return 1;
 538       }
 539       return simple_domaincmp(pv, fv, plen);
 540    }
 541    else if (unanchored == ANCHOR_RIGHT)
 542    {
 543       /* Left anchored, ignore all extra in fqdn */
 544       return simple_domaincmp(pv, fv, plen);
 545    }
 546    else
 547    {
 548       /* Unanchored */
 549       int n;
 550       int maxn = flen - plen;
 551       for (n = 0; n <= maxn; n++)
 552       {
 553          if (!simple_domaincmp(pv, fv, plen))
 554          {
 555             return 0;
 556          }
 557          /*
 558           * Doesn't match from start of fqdn
 559           * Try skipping first part of fqdn
 560           */
 561          fv++;
 562       }
 563       return 1;
 564    }
 565
 566 }
 567
 568
 569 /*********************************************************************
 570  *
 571  * Function    :  create_url_spec
 572  *
 573  * Description :  Creates a "url_spec" structure from a string.
 574  *                When finished, free with unload_url().
 575  *
 576  * Parameters  :
 577  *          1  :  url = Target url_spec to be filled in.  Will be
 578  *                      zeroed before use.
 579  *          2  :  buf = Source pattern, null terminated.  NOTE: The
 580  *                      contents of this buffer are destroyed by this
 581  *                      function.  If this function succeeds, the
 582  *                      buffer is copied to url->spec.  If this
 583  *                      function fails, the contents of the buffer
 584  *                      are lost forever.
 585  *
 586  * Returns     :  JB_ERR_OK - Success
 587  *                JB_ERR_MEMORY - Out of memory
 588  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
 589  *                               written to system log)
 590  *
 591  *********************************************************************/
 592 jb_err create_url_spec(struct url_spec * url, const char * buf)
 593 {
 594    char *p;
 595
 596    assert(url);
 597    assert(buf);
 598
 599    /* Zero memory */
 600    memset(url, '\0', sizeof(*url));
 601
 602    /* save a copy of the orignal specification */
 603    if ((url->spec = strdup(buf)) == NULL)
 604    {
 605       return JB_ERR_MEMORY;
 606    }
 607
 608    if ((p = strchr(buf, '/')))
 609    {
 610       if (NULL == (url->path = strdup(p)))
 611       {
 612          freez(url->spec);
 613          return JB_ERR_MEMORY;
 614       }
 615       url->pathlen = strlen(url->path);
 616       *p = '\0';
 617    }
 618    else
 619    {
 620       url->path    = NULL;
 621       url->pathlen = 0;
 622    }
 623 #ifdef REGEX
 624    if (url->path)
 625    {
 626       int errcode;
 627       char rebuf[BUFFER_SIZE];
 628
 629       if (NULL == (url->preg = zalloc(sizeof(*url->preg))))
 630       {
 631          freez(url->spec);
 632          freez(url->path);
 633          return JB_ERR_MEMORY;
 634       }
 635
 636       sprintf(rebuf, "^(%s)", url->path);
 637
 638       errcode = regcomp(url->preg, rebuf,
 639             (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 640       if (errcode)
 641       {
 642          size_t errlen = regerror(errcode,
 643             url->preg, rebuf, sizeof(rebuf));
 644
 645          if (errlen > (sizeof(rebuf) - (size_t)1))
 646          {
 647             errlen = sizeof(rebuf) - (size_t)1;
 648          }
 649          rebuf[errlen] = '\0';
 650
 651          log_error(LOG_LEVEL_ERROR, "error compiling %s: %s",
 652             url->spec, rebuf);
 653
 654          freez(url->spec);
 655          freez(url->path);
 656          freez(url->preg);
 657
 658          return JB_ERR_PARSE;
 659       }
 660    }
 661 #endif
 662    if ((p = strchr(buf, ':')) == NULL)
 663    {
 664       url->port = 0;
 665    }
 666    else
 667    {
 668       *p++ = '\0';
 669       url->port = atoi(p);
 670    }
 671
 672    if (buf[0] != '\0')
 673    {
 674       char *v[150];
 675       int size;
 676
 677       /* Parse domain part */
 678       if (buf[strlen(buf) - 1] == '.')
 679       {
 680          url->unanchored |= ANCHOR_RIGHT;
 681       }
 682       if (buf[0] == '.')
 683       {
 684          url->unanchored |= ANCHOR_LEFT;
 685       }
 686
 687       /* split domain into components */
 688
 689       url->dbuffer = strdup(buf);
 690       if (NULL == url->dbuffer)
 691       {
 692          freez(url->spec);
 693          freez(url->path);
 694 #ifdef REGEX
 695          freez(url->preg);
 696 #endif /* def REGEX */
 697          return JB_ERR_MEMORY;
 698       }
 699
 700       /* map to lower case */
 701       for (p = url->dbuffer; *p ; p++)
 702       {
 703          *p = tolower((int)(unsigned char)*p);
 704       }
 705
 706       /* split the domain name into components */
 707       url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 708
 709       if (url->dcount < 0)
 710       {
 711          freez(url->spec);
 712          freez(url->path);
 713 #ifdef REGEX
 714          freez(url->preg);
 715 #endif /* def REGEX */
 716          freez(url->dbuffer);
 717          url->dcount = 0;
 718          return JB_ERR_MEMORY;
 719       }
 720       else if (url->dcount != 0)
 721       {
 722
 723          /* save a copy of the pointers in dvec */
 724          size = url->dcount * sizeof(*url->dvec);
 725
 726          url->dvec = (char **)malloc(size);
 727          if (NULL == url->dvec)
 728          {
 729             freez(url->spec);
 730             freez(url->path);
 731 #ifdef REGEX
 732             freez(url->preg);
 733 #endif /* def REGEX */
 734             freez(url->dbuffer);
 735             url->dcount = 0;
 736             return JB_ERR_MEMORY;
 737          }
 738
 739          memcpy(url->dvec, v, size);
 740       }
 741    }
 742
 743    return JB_ERR_OK;
 744
 745 }
 746
 747
 748 /*********************************************************************
 749  *
 750  * Function    :  free_url_spec
 751  *
 752  * Description :  Called from the "unloaders".  Freez the url
 753  *                structure elements.
 754  *
 755  * Parameters  :
 756  *          1  :  url = pointer to a url_spec structure.
 757  *
 758  * Returns     :  N/A
 759  *
 760  *********************************************************************/
 761 void free_url_spec(struct url_spec *url)
 762 {
 763    if (url == NULL) return;
 764
 765    freez(url->spec);
 766    freez(url->dbuffer);
 767    freez(url->dvec);
 768    freez(url->path);
 769 #ifdef REGEX
 770    if (url->preg)
 771    {
 772       regfree(url->preg);
 773       freez(url->preg);
 774    }
 775 #endif
 776
 777 }
 778
 779
 780 /*********************************************************************
 781  *
 782  * Function    :  url_match
 783  *
 784  * Description :  Compare a URL against a URL pattern.
 785  *
 786  * Parameters  :
 787  *          1  :  pattern = a URL pattern
 788  *          2  :  url = URL to match
 789  *
 790  * Returns     :  0 iff the URL matches the pattern, else nonzero.
 791  *
 792  *********************************************************************/
 793 int url_match(const struct url_spec *pattern,
 794               const struct http_request *url)
 795 {
 796    return ((pattern->port == 0) || (pattern->port == url->port))
 797        && ((pattern->dbuffer == NULL) || (domain_match(pattern, url) == 0))
 798        && ((pattern->path == NULL) ||
 799 #ifdef REGEX
 800             (regexec(pattern->preg, url->path, 0, NULL, 0) == 0)
 801 #else
 802             (strncmp(pattern->path, url->path, pattern->pathlen) == 0)
 803 #endif
 804       );
 805 }
 806
 807
 808 /*
 809   Local Variables:
 810   tab-width: 3
 811   end:
 812 */