urlmatch.c

   1 const char urlmatch_rcs[] = "$Id: urlmatch.c,v 1.1 2002/01/17 20:53:46 jongfoster Exp $";
   2 /*********************************************************************
   3  *
   4  * File        :  $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
   5  *
   6  * Purpose     :  Declares functions to match URLs against URL
   7  *                patterns.
   8  *
   9  * Copyright   :  Written by and Copyright (C) 2001 the SourceForge
  10  *                IJBSWA team.  http://ijbswa.sourceforge.net
  11  *
  12  *                Based on the Internet Junkbuster originally written
  13  *                by and Copyright (C) 1997 Anonymous Coders and
  14  *                Junkbusters Corporation.  http://www.junkbusters.com
  15  *
  16  *                This program is free software; you can redistribute it
  17  *                and/or modify it under the terms of the GNU General
  18  *                Public License as published by the Free Software
  19  *                Foundation; either version 2 of the License, or (at
  20  *                your option) any later version.
  21  *
  22  *                This program is distributed in the hope that it will
  23  *                be useful, but WITHOUT ANY WARRANTY; without even the
  24  *                implied warranty of MERCHANTABILITY or FITNESS FOR A
  25  *                PARTICULAR PURPOSE.  See the GNU General Public
  26  *                License for more details.
  27  *
  28  *                The GNU General Public License should be included with
  29  *                this file.  If not, you can view it at
  30  *                http://www.gnu.org/copyleft/gpl.html
  31  *                or write to the Free Software Foundation, Inc., 59
  32  *                Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  33  *
  34  * Revisions   :
  35  *    $Log: urlmatch.c,v $
  36  *    Revision 1.1  2002/01/17 20:53:46  jongfoster
  37  *    Moving all our URL and URL pattern parsing code to the same file - it
  38  *    was scattered around in filters.c, loaders.c and parsers.c.
  39  *
  40  *    Providing a single, simple url_match(pattern,url) function - rather than
  41  *    the 3-line match routine which was repeated all over the place.
  42  *
  43  *    Renaming free_url to free_url_spec, since it frees a struct url_spec.
  44  *
  45  *    Providing parse_http_url() so that URLs can be parsed without faking a
  46  *    HTTP request line for parse_http_request() or repeating the parsing
  47  *    code (both of which were techniques that were actually in use).
  48  *
  49  *    Standardizing that struct http_request is used to represent a URL, and
  50  *    struct url_spec is used to represent a URL pattern.  (Before, URLs were
  51  *    represented as separate variables and a partially-filled-in url_spec).
  52  *
  53  *
  54  *********************************************************************/
  55 \f
  56
  57 #include "config.h"
  58
  59 #ifndef _WIN32
  60 #include <stdio.h>
  61 #include <sys/types.h>
  62 #endif
  63
  64 #include <stdlib.h>
  65 #include <ctype.h>
  66 #include <assert.h>
  67 #include <string.h>
  68
  69 #if !defined(_WIN32) && !defined(__OS2__)
  70 #include <unistd.h>
  71 #endif
  72
  73 #include "project.h"
  74 #include "urlmatch.h"
  75 #include "ssplit.h"
  76 #include "miscutil.h"
  77 #include "errlog.h"
  78
  79 const char urlmatch_h_rcs[] = URLMATCH_H_VERSION;
  80
  81 /* Fix a problem with Solaris.  There should be no effect on other
  82  * platforms.
  83  * Solaris's isspace() is a macro which uses it's argument directly
  84  * as an array index.  Therefore we need to make sure that high-bit
  85  * characters generate +ve values, and ideally we also want to make
  86  * the argument match the declared parameter type of "int".
  87  *
  88  * Why did they write a character function that can't take a simple
  89  * "char" argument?  Doh!
  90  */
  91 #define ijb_isupper(__X) isupper((int)(unsigned char)(__X))
  92 #define ijb_tolower(__X) tolower((int)(unsigned char)(__X))
  93
  94
  95 /*********************************************************************
  96  *
  97  * Function    :  free_http_request
  98  *
  99  * Description :  Freez a http_request structure
 100  *
 101  * Parameters  :
 102  *          1  :  http = points to a http_request structure to free
 103  *
 104  * Returns     :  N/A
 105  *
 106  *********************************************************************/
 107 void free_http_request(struct http_request *http)
 108 {
 109    assert(http);
 110
 111    freez(http->cmd);
 112    freez(http->gpc);
 113    freez(http->host);
 114    freez(http->url);
 115    freez(http->hostport);
 116    freez(http->path);
 117    freez(http->ver);
 118    freez(http->host_ip_addr_str);
 119    freez(http->dbuffer);
 120    freez(http->dvec);
 121    http->dcount = 0;
 122 }
 123
 124
 125 /*********************************************************************
 126  *
 127  * Function    :  parse_http_url
 128  *
 129  * Description :  Parse out the host and port from the URL.  Find the
 130  *                hostname & path, port (if ':'), and/or password (if '@')
 131  *
 132  * Parameters  :
 133  *          1  :  url = URL (or is it URI?) to break down
 134  *          2  :  http = pointer to the http structure to hold elements.
 135  *                       Will be zeroed before use.  Note that this
 136  *                       function sets the http->gpc and http->ver
 137  *                       members to NULL.
 138  *          3  :  csp = Current client state (buffers, headers, etc...)
 139  *
 140  * Returns     :  JB_ERR_OK on success
 141  *                JB_ERR_MEMORY on out of memory
 142  *                JB_ERR_CGI_PARAMS on malformed command/URL
 143  *                                  or >100 domains deep.
 144  *
 145  *********************************************************************/
 146 jb_err parse_http_url(const char * url,
 147                       struct http_request *http,
 148                       struct client_state *csp)
 149 {
 150    /*
 151     * Zero out the results structure
 152     */
 153    memset(http, '\0', sizeof(*http));
 154
 155
 156    /*
 157     * Save our initial URL
 158     */
 159    http->url = strdup(url);
 160    if (http->url == NULL)
 161    {
 162       return JB_ERR_MEMORY;
 163    }
 164
 165
 166    /*
 167     * Split URL into protocol,hostport,path.
 168     */
 169    {
 170       char *buf;
 171       char *url_noproto;
 172       char *url_path;
 173
 174       buf = strdup(url);
 175       if (buf == NULL)
 176       {
 177          return JB_ERR_MEMORY;
 178       }
 179
 180       /* Find the start of the URL in our scratch space */
 181       url_noproto = buf;
 182       if (strncmpic(url_noproto, "http://",  7) == 0)
 183       {
 184          url_noproto += 7;
 185          http->ssl = 0;
 186       }
 187       else if (strncmpic(url_noproto, "https://", 8) == 0)
 188       {
 189          url_noproto += 8;
 190          http->ssl = 1;
 191       }
 192       else
 193       {
 194          http->ssl = 0;
 195       }
 196
 197       url_path = strchr(url_noproto, '/');
 198       if (url_path != NULL)
 199       {
 200          /*
 201           * Got a path.
 202           *
 203           * NOTE: The following line ignores the path for HTTPS URLS.
 204           * This means that you get consistent behaviour if you type a
 205           * https URL in and it's parsed by the function.  (When the
 206           * URL is actually retrieved, SSL hides the path part).
 207           */
 208          http->path = strdup(http->ssl ? "/" : url_path);
 209          *url_path = '\0';
 210          http->hostport = strdup(url_noproto);
 211       }
 212       else
 213       {
 214          /*
 215           * Repair broken HTTP requests that don't contain a path,
 216           * or CONNECT requests
 217           */
 218          http->path = strdup("/");
 219          http->hostport = strdup(url_noproto);
 220       }
 221
 222       free(buf);
 223
 224       if ( (http->path == NULL)
 225         || (http->hostport == NULL))
 226       {
 227          free(buf);
 228          free_http_request(http);
 229          return JB_ERR_MEMORY;
 230       }
 231    }
 232
 233
 234    /*
 235     * Split hostport into user/password (ignored), host, port.
 236     */
 237    {
 238       char *buf;
 239       char *host;
 240       char *port;
 241
 242       buf = strdup(http->hostport);
 243       if (buf == NULL)
 244       {
 245          free_http_request(http);
 246          return JB_ERR_MEMORY;
 247       }
 248
 249       /* check if url contains username and/or password */
 250       host = strchr(buf, '@');
 251       if (host != NULL)
 252       {
 253          /* Contains username/password, skip it and the @ sign. */
 254          host++;
 255       }
 256       else
 257       {
 258          /* No username or password. */
 259          host = buf;
 260       }
 261
 262       /* check if url contains port */
 263       port = strchr(host, ':');
 264       if (port != NULL)
 265       {
 266          /* Contains port */
 267          /* Terminate hostname and point to start of port string */
 268          *port++ = '\0';
 269          http->port = atoi(port);
 270       }
 271       else
 272       {
 273          /* No port specified. */
 274          http->port = (http->ssl ? 143 : 80);
 275       }
 276
 277       http->host = strdup(host);
 278
 279       free(buf);
 280
 281       if (http->host == NULL)
 282       {
 283          free_http_request(http);
 284          return JB_ERR_MEMORY;
 285       }
 286    }
 287
 288
 289    /*
 290     * Split domain name so we can compare it against wildcards
 291     */
 292    {
 293       char *vec[BUFFER_SIZE];
 294       int size;
 295       char *p;
 296
 297       http->dbuffer = strdup(http->host);
 298       if (NULL == http->dbuffer)
 299       {
 300          free_http_request(http);
 301          return JB_ERR_MEMORY;
 302       }
 303
 304       /* map to lower case */
 305       for (p = http->dbuffer; *p ; p++)
 306       {
 307          *p = tolower((int)(unsigned char)*p);
 308       }
 309
 310       /* split the domain name into components */
 311       http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec), 1, 1);
 312
 313       if (http->dcount <= 0)
 314       {
 315          /*
 316           * Error: More than SZ(vec) components in domain
 317           *    or: no components in domain
 318           */
 319          free_http_request(http);
 320          return JB_ERR_PARSE;
 321       }
 322
 323       /* save a copy of the pointers in dvec */
 324       size = http->dcount * sizeof(*http->dvec);
 325
 326       http->dvec = (char **)malloc(size);
 327       if (NULL == http->dvec)
 328       {
 329          free_http_request(http);
 330          return JB_ERR_MEMORY;
 331       }
 332
 333       memcpy(http->dvec, vec, size);
 334    }
 335
 336
 337    return JB_ERR_OK;
 338 }
 339
 340
 341 /*********************************************************************
 342  *
 343  * Function    :  parse_http_request
 344  *
 345  * Description :  Parse out the host and port from the URL.  Find the
 346  *                hostname & path, port (if ':'), and/or password (if '@')
 347  *
 348  * Parameters  :
 349  *          1  :  req = HTTP request line to break down
 350  *          2  :  http = pointer to the http structure to hold elements
 351  *          3  :  csp = Current client state (buffers, headers, etc...)
 352  *
 353  * Returns     :  JB_ERR_OK on success
 354  *                JB_ERR_MEMORY on out of memory
 355  *                JB_ERR_CGI_PARAMS on malformed command/URL
 356  *                                  or >100 domains deep.
 357  *
 358  *********************************************************************/
 359 jb_err parse_http_request(const char *req,
 360                           struct http_request *http,
 361                           struct client_state *csp)
 362 {
 363    char *buf;
 364    char *v[10];
 365    int n;
 366    jb_err err;
 367    int is_connect = 0;
 368
 369    memset(http, '\0', sizeof(*http));
 370
 371    buf = strdup(req);
 372    if (buf == NULL)
 373    {
 374       return JB_ERR_MEMORY;
 375    }
 376
 377    n = ssplit(buf, " \r\n", v, SZ(v), 1, 1);
 378    if (n != 3)
 379    {
 380       free(buf);
 381       return JB_ERR_PARSE;
 382    }
 383
 384    /* this could be a CONNECT request */
 385    if (strcmpic(v[0], "connect") == 0)
 386    {
 387       /* Secure */
 388       is_connect = 1;
 389    }
 390    /* or it could be any other basic HTTP request type */
 391    else if ((0 == strcmpic(v[0], "get"))
 392          || (0 == strcmpic(v[0], "head"))
 393          || (0 == strcmpic(v[0], "post"))
 394          || (0 == strcmpic(v[0], "put"))
 395          || (0 == strcmpic(v[0], "delete"))
 396
 397          /* or a webDAV extension (RFC2518) */
 398          || (0 == strcmpic(v[0], "propfind"))
 399          || (0 == strcmpic(v[0], "proppatch"))
 400          || (0 == strcmpic(v[0], "move"))
 401          || (0 == strcmpic(v[0], "copy"))
 402          || (0 == strcmpic(v[0], "mkcol"))
 403          || (0 == strcmpic(v[0], "lock"))
 404          || (0 == strcmpic(v[0], "unlock"))
 405          )
 406    {
 407       /* Normal */
 408       is_connect = 0;
 409    }
 410    else
 411    {
 412       /* Unknown HTTP method */
 413       free(buf);
 414       return JB_ERR_PARSE;
 415    }
 416
 417    err = parse_http_url(v[1], http, csp);
 418    if (err)
 419    {
 420       free(buf);
 421       return err;
 422    }
 423
 424    /*
 425     * Copy the details into the structure
 426     */
 427    http->ssl = is_connect;
 428    http->cmd = strdup(req);
 429    http->gpc = strdup(v[0]);
 430    http->ver = strdup(v[2]);
 431
 432    if ( (http->cmd == NULL)
 433      || (http->gpc == NULL)
 434      || (http->ver == NULL) )
 435    {
 436       free(buf);
 437       free_http_request(http);
 438       return JB_ERR_MEMORY;
 439    }
 440
 441    return JB_ERR_OK;
 442 }
 443
 444
 445 /*********************************************************************
 446  *
 447  * Function    :  simple_domaincmp
 448  *
 449  * Description :  Domain-wise Compare fqdn's.  The comparison is
 450  *                both left- and right-anchored.  The individual
 451  *                domain names are compared with simplematch().
 452  *                This is only used by domain_match.
 453  *
 454  * Parameters  :
 455  *          1  :  pv = array of patterns to compare
 456  *          2  :  fv = array of domain components to compare
 457  *          3  :  len = length of the arrays (both arrays are the
 458  *                      same length - if they weren't, it couldn't
 459  *                      possibly be a match).
 460  *
 461  * Returns     :  0 => domains are equivalent, else no match.
 462  *
 463  *********************************************************************/
 464 static int simple_domaincmp(char **pv, char **fv, int len)
 465 {
 466    int n;
 467
 468    for (n = 0; n < len; n++)
 469    {
 470       if (simplematch(pv[n], fv[n]))
 471       {
 472          return 1;
 473       }
 474    }
 475
 476    return 0;
 477
 478 }
 479
 480
 481 /*********************************************************************
 482  *
 483  * Function    :  domain_match
 484  *
 485  * Description :  Domain-wise Compare fqdn's. Governed by the bimap in
 486  *                pattern->unachored, the comparison is un-, left-,
 487  *                right-anchored, or both.
 488  *                The individual domain names are compared with
 489  *                simplematch().
 490  *
 491  * Parameters  :
 492  *          1  :  pattern = a domain that may contain a '*' as a wildcard.
 493  *          2  :  fqdn = domain name against which the patterns are compared.
 494  *
 495  * Returns     :  0 => domains are equivalent, else no match.
 496  *
 497  *********************************************************************/
 498 static int domain_match(const struct url_spec *pattern, const struct http_request *fqdn)
 499 {
 500    char **pv, **fv;  /* vectors  */
 501    int    plen, flen;
 502    int unanchored = pattern->unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
 503
 504    plen = pattern->dcount;
 505    flen = fqdn->dcount;
 506
 507    if (flen < plen)
 508    {
 509       /* fqdn is too short to match this pattern */
 510       return 1;
 511    }
 512
 513    pv   = pattern->dvec;
 514    fv   = fqdn->dvec;
 515
 516    if (unanchored == ANCHOR_LEFT)
 517    {
 518       /*
 519        * Right anchored.
 520        *
 521        * Convert this into a fully anchored pattern with
 522        * the fqdn and pattern the same length
 523        */
 524       fv += (flen - plen); /* flen - plen >= 0 due to check above */
 525       return simple_domaincmp(pv, fv, plen);
 526    }
 527    else if (unanchored == 0)
 528    {
 529       /* Fully anchored, check length */
 530       if (flen != plen)
 531       {
 532          return 1;
 533       }
 534       return simple_domaincmp(pv, fv, plen);
 535    }
 536    else if (unanchored == ANCHOR_RIGHT)
 537    {
 538       /* Left anchored, ignore all extra in fqdn */
 539       return simple_domaincmp(pv, fv, plen);
 540    }
 541    else
 542    {
 543       /* Unanchored */
 544       int n;
 545       int maxn = flen - plen;
 546       for (n = 0; n <= maxn; n++)
 547       {
 548          if (!simple_domaincmp(pv, fv, plen))
 549          {
 550             return 0;
 551          }
 552          /*
 553           * Doesn't match from start of fqdn
 554           * Try skipping first part of fqdn
 555           */
 556          fv++;
 557       }
 558       return 1;
 559    }
 560
 561 }
 562
 563
 564 /*********************************************************************
 565  *
 566  * Function    :  create_url_spec
 567  *
 568  * Description :  Creates a "url_spec" structure from a string.
 569  *                When finished, free with unload_url().
 570  *
 571  * Parameters  :
 572  *          1  :  url = Target url_spec to be filled in.  Will be
 573  *                      zeroed before use.
 574  *          2  :  buf = Source pattern, null terminated.  NOTE: The
 575  *                      contents of this buffer are destroyed by this
 576  *                      function.  If this function succeeds, the
 577  *                      buffer is copied to url->spec.  If this
 578  *                      function fails, the contents of the buffer
 579  *                      are lost forever.
 580  *
 581  * Returns     :  JB_ERR_OK - Success
 582  *                JB_ERR_MEMORY - Out of memory
 583  *                JB_ERR_PARSE - Cannot parse regex (Detailed message
 584  *                               written to system log)
 585  *
 586  *********************************************************************/
 587 jb_err create_url_spec(struct url_spec * url, const char * buf)
 588 {
 589    char *p;
 590
 591    assert(url);
 592    assert(buf);
 593
 594    /* Zero memory */
 595    memset(url, '\0', sizeof(*url));
 596
 597    /* save a copy of the orignal specification */
 598    if ((url->spec = strdup(buf)) == NULL)
 599    {
 600       return JB_ERR_MEMORY;
 601    }
 602
 603    if ((p = strchr(buf, '/')))
 604    {
 605       if (NULL == (url->path = strdup(p)))
 606       {
 607          freez(url->spec);
 608          return JB_ERR_MEMORY;
 609       }
 610       url->pathlen = strlen(url->path);
 611       *p = '\0';
 612    }
 613    else
 614    {
 615       url->path    = NULL;
 616       url->pathlen = 0;
 617    }
 618 #ifdef REGEX
 619    if (url->path)
 620    {
 621       int errcode;
 622       char rebuf[BUFFER_SIZE];
 623
 624       if (NULL == (url->preg = zalloc(sizeof(*url->preg))))
 625       {
 626          freez(url->spec);
 627          freez(url->path);
 628          return JB_ERR_MEMORY;
 629       }
 630
 631       sprintf(rebuf, "^(%s)", url->path);
 632
 633       errcode = regcomp(url->preg, rebuf,
 634             (REG_EXTENDED|REG_NOSUB|REG_ICASE));
 635       if (errcode)
 636       {
 637          size_t errlen = regerror(errcode,
 638             url->preg, rebuf, sizeof(rebuf));
 639
 640          if (errlen > (sizeof(rebuf) - (size_t)1))
 641          {
 642             errlen = sizeof(rebuf) - (size_t)1;
 643          }
 644          rebuf[errlen] = '\0';
 645
 646          log_error(LOG_LEVEL_ERROR, "error compiling %s: %s",
 647             url->spec, rebuf);
 648
 649          freez(url->spec);
 650          freez(url->path);
 651          freez(url->preg);
 652
 653          return JB_ERR_PARSE;
 654       }
 655    }
 656 #endif
 657    if ((p = strchr(buf, ':')) == NULL)
 658    {
 659       url->port = 0;
 660    }
 661    else
 662    {
 663       *p++ = '\0';
 664       url->port = atoi(p);
 665    }
 666
 667    if (buf[0] != '\0')
 668    {
 669       char *v[150];
 670       int size;
 671
 672       /* Parse domain part */
 673       if (buf[strlen(buf) - 1] == '.')
 674       {
 675          url->unanchored |= ANCHOR_RIGHT;
 676       }
 677       if (buf[0] == '.')
 678       {
 679          url->unanchored |= ANCHOR_LEFT;
 680       }
 681
 682       /* split domain into components */
 683
 684       url->dbuffer = strdup(buf);
 685       if (NULL == url->dbuffer)
 686       {
 687          freez(url->spec);
 688          freez(url->path);
 689 #ifdef REGEX
 690          freez(url->preg);
 691 #endif /* def REGEX */
 692          return JB_ERR_MEMORY;
 693       }
 694
 695       /* map to lower case */
 696       for (p = url->dbuffer; *p ; p++)
 697       {
 698          *p = tolower((int)(unsigned char)*p);
 699       }
 700
 701       /* split the domain name into components */
 702       url->dcount = ssplit(url->dbuffer, ".", v, SZ(v), 1, 1);
 703
 704       if (url->dcount < 0)
 705       {
 706          freez(url->spec);
 707          freez(url->path);
 708 #ifdef REGEX
 709          freez(url->preg);
 710 #endif /* def REGEX */
 711          freez(url->dbuffer);
 712          url->dcount = 0;
 713          return JB_ERR_MEMORY;
 714       }
 715       else if (url->dcount != 0)
 716       {
 717
 718          /* save a copy of the pointers in dvec */
 719          size = url->dcount * sizeof(*url->dvec);
 720
 721          url->dvec = (char **)malloc(size);
 722          if (NULL == url->dvec)
 723          {
 724             freez(url->spec);
 725             freez(url->path);
 726 #ifdef REGEX
 727             freez(url->preg);
 728 #endif /* def REGEX */
 729             freez(url->dbuffer);
 730             url->dcount = 0;
 731             return JB_ERR_MEMORY;
 732          }
 733
 734          memcpy(url->dvec, v, size);
 735       }
 736    }
 737
 738    return JB_ERR_OK;
 739
 740 }
 741
 742
 743 /*********************************************************************
 744  *
 745  * Function    :  free_url_spec
 746  *
 747  * Description :  Called from the "unloaders".  Freez the url
 748  *                structure elements.
 749  *
 750  * Parameters  :
 751  *          1  :  url = pointer to a url_spec structure.
 752  *
 753  * Returns     :  N/A
 754  *
 755  *********************************************************************/
 756 void free_url_spec(struct url_spec *url)
 757 {
 758    if (url == NULL) return;
 759
 760    freez(url->spec);
 761    freez(url->dbuffer);
 762    freez(url->dvec);
 763    freez(url->path);
 764 #ifdef REGEX
 765    if (url->preg)
 766    {
 767       regfree(url->preg);
 768       freez(url->preg);
 769    }
 770 #endif
 771
 772 }
 773
 774
 775 /*********************************************************************
 776  *
 777  * Function    :  url_match
 778  *
 779  * Description :  Compare a URL against a URL pattern.
 780  *
 781  * Parameters  :
 782  *          1  :  pattern = a URL pattern
 783  *          2  :  url = URL to match
 784  *
 785  * Returns     :  0 iff the URL matches the pattern, else nonzero.
 786  *
 787  *********************************************************************/
 788 int url_match(const struct url_spec *pattern,
 789               const struct http_request *url)
 790 {
 791    return ((pattern->port == 0) || (pattern->port == url->port))
 792        && ((pattern->dbuffer == NULL) || (domain_match(pattern, url) == 0))
 793        && ((pattern->path == NULL) ||
 794 #ifdef REGEX
 795             (regexec(pattern->preg, url->path, 0, NULL, 0) == 0)
 796 #else
 797             (strncmp(pattern->path, url->path, pattern->pathlen) == 0)
 798 #endif
 799       );
 800 }
 801
 802
 803 /*
 804   Local Variables:
 805   tab-width: 3
 806   end:
 807 */