X-Git-Url: http://www.privoxy.org/gitweb/?p=privoxy.git;a=blobdiff_plain;f=pcrs.c;h=48c78fdfd385bfb1e5b2e98f325aa9dbca06486b;hp=1ad3448a605cb7d6ecc02555a0485335df8f65cf;hb=ffa11e6cc708137ca152a78fcfa4bdd03bfa81c1;hpb=7043fa1105a7fa5f7dbcf9824971f47fd88d3b71 diff --git a/pcrs.c b/pcrs.c index 1ad3448a..48c78fdf 100644 --- a/pcrs.c +++ b/pcrs.c @@ -1,26 +1,21 @@ -const char pcrs_rcs[] = "$Id: pcrs.c,v 1.2 2001/05/22 18:46:04 oes Exp $"; +const char pcrs_rcs[] = "$Id: pcrs.c,v 1.8 2001/06/29 21:45:41 oes Exp $"; /********************************************************************* * * File : $Source: /cvsroot/ijbswa/current/pcrs.c,v $ * - * Purpose : This is the alpha release of libpcrs. It is only published - * at this early stage of development, because it is - * needed for a new feature in JunkBuster. - * - * While no inconsistencies, memory leaks or functional bugs - * are known at this time, there *could* be plenty ;-). Also, - * Many pcre-specific options are not yet supported, and - * error handling needs improvement. - * - * pcrs is a supplement to the brilliant pcre library by Philip + * Purpose : pcrs is a supplement to the brilliant pcre library by Philip * Hazel (ph10@cam.ac.uk) and adds Perl-style substitution. That * is, it mimics Perl's 's' operator. * * Currently, there's no documentation besides comments and the * source itself ;-) * - * Copyright : Written and Copyright (C) 2000 by Andreas Oesterhelt + * Note: In addition to perl's options, 'U' for ungreedy and 'T' + * for trivial (i.e.: ignore backrefs in the substitute) are + * supported. + * + * Copyright : Written and Copyright (C) 2000, 2001 by Andreas S. Oesterhelt * * * This program is free software; you can redistribute it @@ -43,6 +38,63 @@ const char pcrs_rcs[] = "$Id: pcrs.c,v 1.2 2001/05/22 18:46:04 oes Exp $"; * * Revisions : * $Log: pcrs.c,v $ + * Revision 1.8 2001/06/29 21:45:41 oes + * Indentation, CRLF->LF, Tab-> Space + * + * Revision 1.7 2001/06/29 13:33:04 oes + * - Cleaned up, renamed and reordered functions, + * improved comments + * - Removed my_strsep + * - Replaced globalflag with a general flags int + * that holds PCRS_GLOBAL, PCRS_SUCCESS, and PCRS_TRIVIAL + * - Introduced trivial option that will prevent pcrs + * from honouring backreferences in the substitute, + * which is useful for large substitutes that are + * red in from somewhere and saves the pain of escaping + * the backrefs + * - Introduced convenience function pcrs_free_joblist() + * - Split pcrs_make_job() into pcrs_compile(), which still + * takes a complete s/// comand as argument and parses it, + * and a new function pcrs_make_job, which takes the + * three separate components. This should make for a + * much friendlier frontend. + * - Removed create_pcrs_job() which was useless + * - Fixed a bug in pcrs_execute + * - Success flag is now handled by pcrs instead of user + * - Removed logentry from cancelled commit + * + * Revision 1.6 2001/06/03 19:12:45 oes + * added FIXME + * + * Revision 1.5 2001/05/29 09:50:24 jongfoster + * Unified blocklist/imagelist/permissionslist. + * File format is still under discussion, but the internal changes + * are (mostly) done. + * + * Also modified interceptor behaviour: + * - We now intercept all URLs beginning with one of the following + * prefixes (and *only* these prefixes): + * * http://i.j.b/ + * * http://ijbswa.sf.net/config/ + * * http://ijbswa.sourceforge.net/config/ + * - New interceptors "home page" - go to http://i.j.b/ to see it. + * - Internal changes so that intercepted and fast redirect pages + * are not replaced with an image. + * - Interceptors now have the option to send a binary page direct + * to the client. (i.e. ijb-send-banner uses this) + * - Implemented show-url-info interceptor. (Which is why I needed + * the above interceptors changes - a typical URL is + * "http://i.j.b/show-url-info?url=www.somesite.com/banner.gif". + * The previous mechanism would not have intercepted that, and + * if it had been intercepted then it then it would have replaced + * it with an image.) + * + * Revision 1.4 2001/05/25 14:12:40 oes + * Fixed bug: Empty substitutes now detected + * + * Revision 1.3 2001/05/25 11:03:55 oes + * Added sanity check for NULL jobs to pcrs_exec_substitution + * * Revision 1.2 2001/05/22 18:46:04 oes * * - Enabled filtering banners by size rather than URL @@ -100,59 +152,6 @@ const char pcrs_rcs[] = "$Id: pcrs.c,v 1.2 2001/05/22 18:46:04 oes Exp $"; const char pcrs_h_rcs[] = PCRS_H_VERSION; -/********************************************************************* - * - * Function : my_strsep - * - * Description : Convenience function. It acts like strsep, except that - * it respects quoting of the delimiter character with the - * quote character. (And, of course, quoting the quote char - * with itself.) Called from `pcrs_make_job'. - * - * Parameters : - * 1 : token = current token - * 2 : text = string to tokenize - * 3 : delimiter = single character deliminter - * 4 : quote_char = character to cause quoting - * - * Returns : -1 => failure, else the length of the token found. - * In the latter case, *text is the token's start. - * - *********************************************************************/ -int my_strsep(char *token, char **text, char delimiter, char quote_char) -{ - int i, k=0, limit, quoted = FALSE; - - limit = strlen(*text); - if ( 0 == limit ) - { - return -1; - } - - token[0] = '\0'; - - for (i=0; i < limit; i++) - { - if (text[0][i] == delimiter && !quoted) - { - *text += 1; - break; - } - else if (text[0][i] == quote_char && !quoted && i+1 < limit && text[0][i+1] == delimiter) - { - quoted = TRUE; - continue; - } - token[k++] = text[0][i]; - quoted = FALSE; - } - token[k] = '\0'; - *text += i; - return k; - -} - - /********************************************************************* * * Function : pcrs_compile_perl_options @@ -160,33 +159,36 @@ int my_strsep(char *token, char **text, char delimiter, char quote_char) * Description : This function parses a string containing the options to * Perl's s/// operator. It returns an integer that is the * pcre equivalent of the symbolic optstring. - * Since pcre doesn't know about Perl's 'g' (global) option, - * but pcrs needs it, the globalflag integer is set if 'g' - * is encountered. - * + * Since pcre doesn't know about Perl's 'g' (global) or pcrs', + * 'T' (trivial) options but pcrs needs them, the corresponding + * flags are set if 'g'or 'T' is encountered. + * Note: The 'T' and 'U' options do not conform to Perl. + * * Parameters : * 1 : optstring = string with options in perl syntax - * 2 : globalflag = see description + * 2 : flags = see description * * Returns : option integer suitable for pcre * *********************************************************************/ -int pcrs_compile_perl_options(char *optstring, int *globalflag) +int pcrs_compile_perl_options(char *optstring, int *flags) { - int i, rc = 0; - *globalflag = 0; + size_t i; + int rc = 0; + *flags = 0; for (i=0; i < strlen(optstring); i++) { switch(optstring[i]) { case 'e': break; - case 'g': *globalflag = 1; break; + case 'g': *flags |= PCRS_GLOBAL; break; case 'i': rc |= PCRE_CASELESS; break; case 'm': rc |= PCRE_MULTILINE; break; case 'o': break; case 's': rc |= PCRE_DOTALL; break; case 'x': rc |= PCRE_EXTENDED; break; case 'U': rc |= PCRE_UNGREEDY; break; + case 'T': *flags |= PCRS_TRIVIAL; break; default: break; } } @@ -215,7 +217,7 @@ int pcrs_compile_perl_options(char *optstring, int *globalflag) * the reason. * *********************************************************************/ -pcrs_substitute *pcrs_compile_replacement(char *replacement, int *errptr) +pcrs_substitute *pcrs_compile_replacement(char *replacement, int trivialflag, int *errptr) { int length, i, k = 0, l = 0, quoted = 0, idx; char *text, *num_ptr, *numbers = "0123456789"; @@ -235,57 +237,64 @@ pcrs_substitute *pcrs_compile_replacement(char *replacement, int *errptr) length = strlen(replacement); - for (i=0; i < length; i++) + if (trivialflag) { - /* Backslash treatment */ - if (replacement[i] == '\\') + k = length; + } + else + { + for (i=0; i < length; i++) { - if (quoted) - { - text[k++] = replacement[i]; - quoted = 0; - } - else + /* Backslash treatment */ + if (replacement[i] == '\\') { - quoted = 1; + if (quoted) + { + text[k++] = replacement[i]; + quoted = 0; + } + else + { + quoted = 1; + } + continue; } - continue; - } - /* Dollar treatment */ - if (replacement[i] == '$' && !quoted && i < length - 1) - { - if (strchr("0123456789&", replacement[i + 1]) == NULL) + /* Dollar treatment */ + if (replacement[i] == '$' && !quoted && i < length - 1) { - text[k++] = replacement[i]; - } - else - { - r->block_length[l] = k - r->block_offset[l]; - r->backref[l] = 0; - if (replacement[i + 1] != '&') + if (strchr("0123456789&", replacement[i + 1]) == NULL) + { + text[k++] = replacement[i]; + } + else { - while ((num_ptr = strchr(numbers, replacement[++i])) != NULL && i < length) + r->block_length[l] = k - r->block_offset[l]; + r->backref[l] = 0; + if (replacement[i + 1] != '&') { - idx = num_ptr - numbers; - r->backref[l] = r->backref[l] * 10 + idx; + while ((num_ptr = strchr(numbers, replacement[++i])) != NULL && i < length) + { + idx = num_ptr - numbers; + r->backref[l] = r->backref[l] * 10 + idx; + } + i--; } - i--; + else + i++; + if (r->backref[l] < PCRS_MAX_SUBMATCHES) + r->backref_count[r->backref[l]] += 1; + l++; + r->block_offset[l] = k; } - else - i++; - if (r->backref[l] < PCRS_MAX_SUBMATCHES) - r->backref_count[r->backref[l]] += 1; - l++; - r->block_offset[l] = k; + continue; } - continue; - } - /* Plain char treatment */ - text[k++] = replacement[i]; - quoted = 0; - } + /* Plain char treatment */ + text[k++] = replacement[i]; + quoted = 0; + } + } /* -END- if (!trivialflag) */ text[k] = '\0'; r->text = text; @@ -307,8 +316,8 @@ pcrs_substitute *pcrs_compile_replacement(char *replacement, int *errptr) * Parameters : * 1 : job = pointer to the pcrs_job structure to be freed * - * Returns : a pointer to the next job, if there was any, or - * NULL otherwise. + * Returns : a pointer to the next job, if there was any, or + * NULL otherwise. * *********************************************************************/ pcrs_job *pcrs_free_job(pcrs_job *job) @@ -335,15 +344,37 @@ pcrs_job *pcrs_free_job(pcrs_job *job) } +/********************************************************************* + * + * Function : pcrs_free_joblist + * + * Description : Iterates through a chained list of pcrs_job's and + * frees them using pcrs_free_job. + * + * Parameters : + * 1 : joblist = pointer to the first pcrs_job structure to + * be freed + * + * Returns : N/A + * + *********************************************************************/ +void pcrs_free_joblist(pcrs_job *joblist) +{ + while ( NULL != (joblist = pcrs_free_job(joblist)) ) {}; + + return; + +} + /********************************************************************* * - * Function : pcrs_make_job + * Function : pcrs_compile_command * - * Description : Main entry point. Takes a string with a Perl-style - * s/// command and returns a corresponding pcrs_job, - * or NULL if compiling the job fails at any stage. - * Diagnostics could obviously be improved. + * Description : Parses a string with a Perl-style s/// command, + * calls pcrs_compile, and returns a corresponding + * pcrs_job, or NULL if parsing or compiling the job + * fails. * * Parameters : * 1 : command = string with perl-style s/// command @@ -355,94 +386,68 @@ pcrs_job *pcrs_free_job(pcrs_job *job) * has the reason. * *********************************************************************/ -pcrs_job *pcrs_make_job(char *command, int *errptr) +pcrs_job *pcrs_compile_command(char *command, int *errptr) { - char *dummy, *token, delimiter; - const char *error; - int i = 0, globalflag; + int i, k, l, limit, quoted = FALSE; + char delimiter; + char *tokens[4]; pcrs_job *newjob; - /* Get and init memory */ - if ((newjob = (pcrs_job *)malloc(sizeof(pcrs_job))) == NULL) - { - *errptr = PCRS_ERR_NOMEM; - return NULL; - } - memset(newjob, '\0', sizeof(pcrs_job)); + i = k = l = 0; - /* Command too short? */ - if (strlen(command) < 4) + /* + * Tokenize the perl command + */ + limit = strlen(command); + if (limit < 4) { *errptr = PCRS_ERR_CMDSYNTAX; - pcrs_free_job(newjob); return NULL; } - - /* Split command into tokens and handle them */ - delimiter = command[1]; - token = (char *)malloc(strlen(command)); /* current token */ - dummy = (char *)malloc(strlen(command)); /* must store pattern, since we can't */ - /* use it until the options are known */ - while (my_strsep(token, &command, delimiter, '\\') >= 0) + else { - switch (i) - { - /* We don't care about the command and assume 's' */ - case 0: - break; + delimiter = command[1]; + } - /* The pattern */ - case 1: - strcpy(dummy, token); - break; + tokens[l] = (char *) malloc(limit + 1); - /* The substitute */ - case 2: - newjob->substitute = pcrs_compile_replacement(token, errptr); - if (newjob->substitute == NULL) - { - pcrs_free_job(newjob); - return NULL; - } - break; + for (i=0; i <= limit; i++) + { - /* The options */ - case 3: - newjob->options = pcrs_compile_perl_options(token, &globalflag); - newjob->globalflag = globalflag; + if (command[i] == delimiter && !quoted) + { + if (l == 3) + { + l = -1; break; + } + tokens[0][k++] = '\0'; + tokens[++l] = tokens[0] + k; + continue; + } - /* There shouldn't be anything else! */ - default: - *errptr = PCRS_ERR_CMDSYNTAX; - pcrs_free_job(newjob); - return NULL; + else if (command[i] == '\\' && !quoted && i+1 < limit && command[i+1] == delimiter) + { + quoted = TRUE; + continue; } - i++; + tokens[0][k++] = command[i]; + quoted = FALSE; } - free(token); - /* Compile the pattern */ - newjob->pattern = pcre_compile(dummy, newjob->options, &error, errptr, NULL); - if (newjob->pattern == NULL) - { - pcrs_free_job(newjob); - return NULL; - } - free(dummy); /* - * Generate hints. This has little overhead, since the - * hints will be NULL for a boring pattern anyway. + * Syntax error ? */ - newjob->hints = pcre_study(newjob->pattern, 0, &error); - if (error != NULL) + if (l != 3) { - *errptr = PCRS_ERR_STUDY; - pcrs_free_job(newjob); + *errptr = PCRS_ERR_CMDSYNTAX; + free(tokens[0]); return NULL; } + newjob = pcrs_compile(tokens[1], tokens[2], tokens[3], errptr); + free(tokens[0]); return newjob; } @@ -450,49 +455,97 @@ pcrs_job *pcrs_make_job(char *command, int *errptr) /********************************************************************* * - * Function : create_pcrs_job + * Function : pcrs_compile * - * Description : Create a job from all its components, if you don't - * have a Perl command to start from. Rather boring. + * Description : Takes the three arguments to a perl s/// command + * and compiles a pcrs_job structure from them. * * Parameters : - * 1 : pattern = pointer to pcre pattern - * 2 : hints = pointer to pcre hints - * 3 : options = options in pcre format - * 4 : globalflag = flag that indicates if global matching is desired - * 5 : substitute = pointer to pcrs_substitute data structure - * 2 : errptr = pointer to an integer in which error + * 1 : pattern = string with perl-style pattern + * 2 : substitute = string with perl-style substitute + * 3 : options = string with perl-style options + * 4 : errptr = pointer to an integer in which error * conditions can be returned. * - * Returns : pcrs_job structure, or NULL if an error was encountered. - * In that case, *errptr has the reason why. + * Returns : a corresponding pcrs_job data structure, or NULL + * if an error was encountered. In that case, *errptr + * has the reason. * *********************************************************************/ -pcrs_job *create_pcrs_job(pcre *pattern, pcre_extra *hints, int options, int globalflag, pcrs_substitute *substitute, int *errptr) +pcrs_job *pcrs_compile(char *pattern, char *substitute, char *options, int *errptr) { pcrs_job *newjob; + int flags; + const char *error; + + + /* + * Handle NULL arguments + */ + if (pattern == NULL) pattern = ""; + if (substitute == NULL) substitute = ""; + if (options == NULL) options = ""; + - if ((newjob = (pcrs_job *)malloc(sizeof(pcrs_job))) == NULL) + /* + * Get and init memory + */ + if (NULL == (newjob = (pcrs_job *)malloc(sizeof(pcrs_job)))) { *errptr = PCRS_ERR_NOMEM; return NULL; } memset(newjob, '\0', sizeof(pcrs_job)); - newjob->pattern = pattern; - newjob->hints = hints; - newjob->options = options; - newjob->globalflag = globalflag; - newjob->substitute = substitute; - return(newjob); + /* + * Evaluate the options + */ + newjob->options = pcrs_compile_perl_options(options, &flags); + newjob->flags = flags; + + + /* + * Compile the pattern + */ + newjob->pattern = pcre_compile(pattern, newjob->options, &error, errptr, NULL); + if (newjob->pattern == NULL) + { + pcrs_free_job(newjob); + return NULL; + } + + + /* + * Generate hints. This has little overhead, since the + * hints will be NULL for a boring pattern anyway. + */ + newjob->hints = pcre_study(newjob->pattern, 0, &error); + if (error != NULL) + { + *errptr = PCRS_ERR_STUDY; + pcrs_free_job(newjob); + return NULL; + } + + + /* + * Compile the substitute + */ + if (NULL == (newjob->substitute = pcrs_compile_replacement(substitute, newjob->flags & PCRS_TRIVIAL, errptr))) + { + pcrs_free_job(newjob); + return NULL; + } + + return newjob; } /********************************************************************* * - * Function : pcrs_exec_substitution + * Function : pcrs_execute * * Description : Modify the subject by executing the regular substitution * defined by the job. Since the result may be longer than @@ -504,44 +557,62 @@ pcrs_job *create_pcrs_job(pcre *pattern, pcre_extra *hints, int options, int glo * Parameters : * 1 : job = the pcrs_job to be executed * 2 : subject = the subject (== original) string - * 3 : subject_length = the subject's length + * 3 : subject_length = the subject's length + * INCLUDING the terminating zero, if string! * 4 : result = char** for returning the result * 5 : result_length = int* for returning the result's length * * Returns : the number of substitutions that were made. May be > 1 - * if job->globalflag was set + * if job->flags contained PCRS_GLOBAL * *********************************************************************/ -int pcrs_exec_substitution(pcrs_job *job, char *subject, int subject_length, char **result, int *result_length) +int pcrs_execute(pcrs_job *job, char *subject, int subject_length, char **result, int *result_length) { int offsets[3 * PCRS_MAX_SUBMATCHES], - offset = 0, i=0, k, matches_found, newsize, submatches; + offset, i, k, + matches_found, + newsize, + submatches; pcrs_match matches[PCRS_MAX_MATCHES]; char *result_offset; - /* Sanity first */ + offset = i = k = 0; + + /* + * Sanity check + */ if (job == NULL || job->pattern == NULL || job->substitute == NULL) { *result = NULL; return(PCRS_ERR_BADJOB); } + + /* + * Find the pattern and calculate the space + * requirements for the result (newsize) + */ newsize=subject_length; - /* Find.. */ - while ((submatches = pcre_exec(job->pattern, job->hints, subject, subject_length, offset, 0, offsets, 99)) > 0) + while ((submatches = pcre_exec(job->pattern, job->hints, subject, subject_length, offset, 0, offsets, 3 * PCRS_MAX_SUBMATCHES)) > 0) { + job->flags |= PCRS_SUCCESS; matches[i].submatches = submatches; for (k=0; k < submatches; k++) { matches[i].submatch_offset[k] = offsets[2 * k]; - matches[i].submatch_length[k] = offsets[2 * k + 1] - offsets[2 * k]; /* Non-found optional submatches have length -1-(-1)==0 */ - newsize += matches[i].submatch_length[k] * job->substitute->backref_count[k]; /* reserve mem for each submatch as often as it is ref'd */ + + /* Note: Non-found optional submatches have length -1-(-1)==0 */ + matches[i].submatch_length[k] = offsets[2 * k + 1] - offsets[2 * k]; + + /* reserve mem for each submatch as often as it is ref'd */ + newsize += matches[i].submatch_length[k] * job->substitute->backref_count[k]; } - newsize += strlen(job->substitute->text) - matches[i].submatch_length[0]; /* plus replacement text size minus match text size */ + /* plus replacement text size minus match text size */ + newsize += strlen(job->substitute->text) - matches[i].submatch_length[0]; /* Non-global search or limit reached? */ - if (++i >= PCRS_MAX_MATCHES || !job->globalflag ) break; + if (++i >= PCRS_MAX_MATCHES || !(job->flags & PCRS_GLOBAL) ) break; /* Don't loop on empty matches */ if (offsets[1] == offset) @@ -553,22 +624,30 @@ int pcrs_exec_substitution(pcrs_job *job, char *subject, int subject_length, cha else offset = offsets[1]; } - if (submatches < -1) return submatches; /* Pass pcre error through */ + /* Pass pcre error through if failiure */ + if (submatches < -1) return submatches; matches_found = i; - /* ..get memory ..*/ + + /* + * Get memory for the result + */ if ((*result = (char *)malloc(newsize)) == NULL) /* must be free()d by caller */ { return PCRS_ERR_NOMEM; } - /* ..and replace */ + + /* + * Replace + */ offset = 0; result_offset = *result; for (i=0; i < matches_found; i++) { - memcpy(result_offset, subject + offset, matches[i].submatch_offset[0] - offset); /* copy the chunk preceding the match */ + /* copy the chunk preceding the match */ + memcpy(result_offset, subject + offset, matches[i].submatch_offset[0] - offset); result_offset += matches[i].submatch_offset[0] - offset; /* For every segment of the substitute.. */ @@ -578,14 +657,14 @@ int pcrs_exec_substitution(pcrs_job *job, char *subject, int subject_length, cha memcpy(result_offset, job->substitute->text + job->substitute->block_offset[k], job->substitute->block_length[k]); result_offset += job->substitute->block_length[k]; - /* ..plus, if it's not the last chunk (i.e.: There IS a backref).. */ + /* ..plus, if it's not the last chunk, i.e.: There *is* a backref.. */ if (k != job->substitute->backrefs - /* ..and a nonempty match.. */ - && matches[i].submatch_length[job->substitute->backref[k]] > 0 - /* ..and in legal range, ... */ - && job->substitute->backref[k] <= PCRS_MAX_SUBMATCHES) + /* ..in legal range.. */ + && job->substitute->backref[k] <= PCRS_MAX_SUBMATCHES + /* ..and referencing a nonempty match.. */ + && matches[i].submatch_length[job->substitute->backref[k]] > 0) { - /* copy the submatch that is ref'd. */ + /* ..copy the submatch that is ref'd. */ memcpy( result_offset, subject + matches[i].submatch_offset[job->substitute->backref[k]],