From: Fabian Keil Date: Sat, 17 Jun 2023 11:20:24 +0000 (+0200) Subject: Add pcre2 support X-Git-Tag: v_4_0_0~187 X-Git-Url: http://www.privoxy.org/gitweb/?a=commitdiff_plain;h=53748ca8ca3c893025be34dd4f104546fcbd0602;p=privoxy.git Add pcre2 support This is currently expected to cause crashes on Windows when compiled with GUI support. Closes bug #935. Initial patch submitted by: Gagan Sidhu --- diff --git a/acconfig.h b/acconfig.h index dbf443bb..9fbe5a72 100644 --- a/acconfig.h +++ b/acconfig.h @@ -225,11 +225,17 @@ /* Define if pcre.h must be included as <pcre/pcre.h> */ #undef PCRE_H_IN_SUBDIR +#undef PCRE2_H_IN_SUBDIR + +#undef HAVE_PCRE2 +#undef HAVE_PCRE2POSIX /* Define if pcreposix.h must be included as <pcre/pcreposix.h> */ #undef PCREPOSIX_H_IN_SUBDIR +#undef PCRE2POSIX_H_IN_SUBDIR + @BOTTOM@ /* diff --git a/actions.c b/actions.c index 7fbcd3a3..379c5e97 100644 --- a/actions.c +++ b/actions.c @@ -828,8 +828,12 @@ int update_action_bits_for_tag(struct client_state *csp, const char *tag) continue; } +#ifdef HAVE_PCRE2 + if (pcre2_pattern_matches(b->url->pattern.tag_regex, tag)) +#else /* and check if one of the tag patterns matches the tag, */ if (0 == regexec(b->url->pattern.tag_regex, tag, 0, NULL, 0)) +#endif { /* if it does, update the action bit map, */ if (merge_current_action(csp->action, b->action)) @@ -884,7 +888,11 @@ jb_err check_negative_tag_patterns(struct client_state *csp, unsigned int flag) } for (tag = csp->tags->first; NULL != tag; tag = tag->next) { +#ifdef HAVE_PCRE2 + if (pcre2_pattern_matches(b->url->pattern.tag_regex, tag->str)) +#else if (0 == regexec(b->url->pattern.tag_regex, tag->str, 0, NULL, 0)) +#endif { /* * The pattern matches at least one tag, thus the action diff --git a/cgi.c b/cgi.c index 5d7b7025..d60166f2 100644 --- a/cgi.c +++ b/cgi.c @@ -2023,7 +2023,7 @@ jb_err template_fill(char **template_ptr, const struct map *exports) char buf[BUFFER_SIZE]; char *tmp_out_buffer; char *file_buffer; - size_t 
size; + size_t buffer_size, new_size; int error; const char *flags; @@ -2032,7 +2032,7 @@ jb_err template_fill(char **template_ptr, const struct map *exports) assert(exports); file_buffer = *template_ptr; - size = strlen(file_buffer) + 1; + buffer_size = strlen(file_buffer) + 1; /* * Assemble pcrs joblist from exports map @@ -2082,7 +2082,10 @@ jb_err template_fill(char **template_ptr, const struct map *exports) } else { - error = pcrs_execute(job, file_buffer, size, &tmp_out_buffer, &size); + error = pcrs_execute(job, file_buffer, buffer_size, &tmp_out_buffer, + &new_size); + + buffer_size = new_size; pcrs_free_job(job); if (NULL == tmp_out_buffer) diff --git a/client-tags.c b/client-tags.c index 51e8a9c5..474e5695 100644 --- a/client-tags.c +++ b/client-tags.c @@ -43,6 +43,7 @@ #include "miscutil.h" #include "errlog.h" #include "parsers.h" +#include "urlmatch.h" struct client_specific_tag { @@ -658,7 +659,11 @@ int client_tag_match(const struct pattern_spec *pattern, for (tag = tags->first; tag != NULL; tag = tag->next) { +#ifdef HAVE_PCRE2 + if (pcre2_pattern_matches(pattern->pattern.tag_regex, tag->str)) +#else if (0 == regexec(pattern->pattern.tag_regex, tag->str, 0, NULL, 0)) +#endif { log_error(LOG_LEVEL_TAGGING, "Client tag '%s' matches.", tag->str); return 1; diff --git a/configure.in b/configure.in index c6e916e4..9189b8a9 100644 --- a/configure.in +++ b/configure.in @@ -863,12 +863,47 @@ else ]) fi +AC_ARG_ENABLE(pcre2, +[ --disable-pcre2 Don't try to use pcre2 even if it's available], +[enableval2=$enableval], +[enableval2=yes]) +if test $enableval2 = yes; then + try_pcre2=yes +else + AC_MSG_WARN([Ignoring pcre2 even if it's available]) + try_pcre2=no +fi + +if test $try_pcre2 != no; then dnl ================================================================= dnl Checks for libraries. 
dnl ================================================================= dnl Note: Some systems may have the library but not the system header dnl file, so we must check for both. dnl Also check for correct version +AC_CHECK_LIB(pcre2-8, pcre2_compile_8, [ + AC_CHECK_HEADER(pcre2.h, [ + AC_EGREP_HEADER(pcre2_pattern_info, pcre2.h,[have_pcre2=yes; AC_DEFINE(HAVE_PCRE2)], [AC_MSG_WARN([[pcre2 old version installed]]); have_pcre2=no]) + ], [ + AC_CHECK_HEADER(pcre2/pcre2.h, [ + AC_EGREP_HEADER(pcre2_pattern_info, pcre2/pcre2.h, [have_pcre2=yes; AC_DEFINE(PCRE2_H_IN_SUBDIR)], [AC_MSG_WARN([[pcre2 old version installed]]); have_pcre2=no]) + ], [have_pcre2=no]) + ], [#define PCRE2_CODE_UNIT_WIDTH 8]) +], [have_pcre2=no]) + +AC_CHECK_LIB(pcre2-posix, regcomp, [ + AC_CHECK_HEADER(pcre2posix.h, [ + AC_EGREP_HEADER(pcre2_regerror, pcre2posix.h, [have_pcre2posix=yes],[AC_MSG_WARN([[pcre2posix old version installed]]); have_pcre2posix=no]) + ], [ + AC_CHECK_HEADER(pcre/pcre2posix.h, [ + AC_EGREP_HEADER(pcre2_regerror, pcre2/pcre2posix.h, [have_pcre2posix=yes; AC_DEFINE(PCRE2POSIX_H_IN_SUBDIR)],[AC_MSG_WARN([[pcre2posix old version installed]]); have_pcre2posix=no]) + ], [have_pcre2posix=no]) + ]) +], [have_pcre2posix=no], -lpcre2-8) +fi + +if test $have_pcre2 = "no"; then + AC_CHECK_LIB(pcre, pcre_compile, [ AC_CHECK_HEADER(pcre.h, [ AC_EGREP_HEADER(pcre_fullinfo, pcre.h, [have_pcre=yes], [AC_MSG_WARN([[pcre old version installed]]); have_pcre=no]) @@ -889,6 +924,7 @@ AC_CHECK_LIB(pcreposix, regcomp, [ ]) ], [have_pcreposix=no], -lpcre) +fi dnl ================================================================ dnl libpcrs is temporarily disabled. dnl @@ -1095,6 +1131,31 @@ fi # we don't need pcreposix, then link pcre dynamically; else # build it and link statically # + +#check for libpcre2 first. 
then regular pcre + +if test $have_pcre2 = "yes"; then + echo "using libpcre2" + STATIC_PCRE_ONLY=# + LIBS="$LIBS -lpcre2-8 -lpcre2-posix" + if test "$use_static_pcre" = "yes"; then + pcre_dyn=no + AC_DEFINE(PCRE_STATIC,1,[Define to statically link to pcre library on Windows.]) +# see /usr/i686-w64-mingw32/sys-root/mingw/include/pcre.h line 54 +# #if defined(_WIN32) && !defined(PCRE_STATIC) +# # ifndef PCRE_EXP_DECL +# # define PCRE_EXP_DECL extern __declspec(dllimport) +# # endif +# If you want to statically link a program against a PCRE library in the form of +# a non-dll .a file, you must define PCRE_STATIC before including pcre.h or +# pcrecpp.h, otherwise the pcre_malloc() and pcre_free() exported functions will +# be declared __declspec(dllimport), with unwanted results. + else + pcre_dyn=yes + AC_DEFINE(FEATURE_DYNAMIC_PCRE,1,[Define to dynamically link to pcre.]) + fi +else + if test $have_pcre = "yes"; then echo "using libpcre" STATIC_PCRE_ONLY=# @@ -1116,7 +1177,8 @@ if test $have_pcre = "yes"; then AC_DEFINE(FEATURE_DYNAMIC_PCRE,1,[Define to dynamically link to pcre.]) fi else - AC_MSG_ERROR(pcre library not detected.) + AC_MSG_ERROR(Detected neither pcre2 nor pcre library.) 
+fi fi AC_DEFINE(FEATURE_CONNECTION_KEEP_ALIVE) diff --git a/pcrs.c b/pcrs.c index 007f7cc1..1875ba1a 100644 --- a/pcrs.c +++ b/pcrs.c @@ -57,7 +57,7 @@ * Internal prototypes */ -static int pcrs_parse_perl_options(const char *optstring, int *flags); +static int pcrs_parse_perl_options(const char *optstring, unsigned int *flags); static pcrs_substitute *pcrs_compile_replacement(const char *replacement, int trivialflag, int capturecount, int *errptr); static int is_hex_sequence(const char *sequence); @@ -83,25 +83,25 @@ const char *pcrs_strerror(const int error) switch (error) { /* Passed-through PCRE error: */ - case PCRE_ERROR_NOMEMORY: return "(pcre:) No memory"; + case PCREn(ERROR_NOMEMORY): return "(pcre:) No memory"; /* Shouldn't happen unless PCRE or PCRS bug, or user messed with compiled job: */ - case PCRE_ERROR_NULL: return "(pcre:) NULL code or subject or ovector"; - case PCRE_ERROR_BADOPTION: return "(pcre:) Unrecognized option bit"; - case PCRE_ERROR_BADMAGIC: return "(pcre:) Bad magic number in code"; + case PCREn(ERROR_NULL): return "(pcre:) NULL code or subject or ovector"; + case PCREn(ERROR_BADOPTION): return "(pcre:) Unrecognized option bit"; + case PCREn(ERROR_BADMAGIC): return "(pcre:) Bad magic number in code"; +#if defined(PCRE_ERROR_UNKNOWN_NODE) case PCRE_ERROR_UNKNOWN_NODE: return "(pcre:) Bad node in pattern"; - +#endif /* Can't happen / not passed: */ - case PCRE_ERROR_NOSUBSTRING: return "(pcre:) Fire in power supply"; - case PCRE_ERROR_NOMATCH: return "(pcre:) Water in power supply"; + case PCREn(ERROR_NOSUBSTRING): return "(pcre:) Fire in power supply"; + case PCREn(ERROR_NOMATCH): return "(pcre:) Water in power supply"; #ifdef PCRE_ERROR_MATCHLIMIT /* * Only reported by PCRE versions newer than our own. 
*/ - case PCRE_ERROR_MATCHLIMIT: return "(pcre:) Match limit reached"; + case PCREn(ERROR_MATCHLIMIT): return "(pcre:) Match limit reached"; #endif /* def PCRE_ERROR_MATCHLIMIT */ - /* PCRS errors: */ case PCRS_ERR_NOMEM: return "(pcrs:) No memory"; case PCRS_ERR_CMDSYNTAX: return "(pcrs:) Syntax error while parsing command"; @@ -111,16 +111,14 @@ const char *pcrs_strerror(const int error) case PCRS_WARN_TRUNCATION: return "(pcrs:) At least one variable was too big and has been truncated before compilation"; - /* - * XXX: With the exception of PCRE_ERROR_MATCHLIMIT we - * only catch PCRE errors that can happen with our internal - * version. If Privoxy is linked against a newer - * PCRE version all bets are off ... - */ default: +#ifdef HAVE_PCRE2 + pcre2_get_error_message(error, (PCRE2_UCHAR8*)buf, sizeof(buf)); +#else snprintf(buf, sizeof(buf), "Error code %d. For details, check the pcre documentation.", error); +#endif return buf; } } @@ -149,7 +147,7 @@ const char *pcrs_strerror(const int error) * Returns : option integer suitable for pcre * *********************************************************************/ -static int pcrs_parse_perl_options(const char *optstring, int *flags) +static int pcrs_parse_perl_options(const char *optstring, unsigned int *flags) { size_t i; int rc = 0; @@ -163,13 +161,13 @@ static int pcrs_parse_perl_options(const char *optstring, int *flags) { case 'e': break; /* ToDo ;-) */ case 'g': *flags |= PCRS_GLOBAL; break; - case 'i': rc |= PCRE_CASELESS; break; - case 'm': rc |= PCRE_MULTILINE; break; + case 'i': rc |= PCREn(CASELESS); break; + case 'm': rc |= PCREn(MULTILINE); break; case 'o': break; - case 's': rc |= PCRE_DOTALL; break; - case 'x': rc |= PCRE_EXTENDED; break; + case 's': rc |= PCREn(DOTALL); break; + case 'x': rc |= PCREn(EXTENDED); break; case 'D': *flags |= PCRS_DYNAMIC; break; - case 'U': rc |= PCRE_UNGREEDY; break; + case 'U': rc |= PCREn(UNGREEDY); break; case 'T': *flags |= PCRS_TRIVIAL; break; default: break; } 
@@ -471,7 +469,15 @@ pcrs_job *pcrs_free_job(pcrs_job *job) else { next = job->next; - if (job->pattern != NULL) free(job->pattern); + if (job->pattern != NULL) + { +#ifdef HAVE_PCRE2 + pcre2_code_free(job->pattern); +#else + free(job->pattern); +#endif + } +#ifndef HAVE_PCRE2 if (job->hints != NULL) { #ifdef PCRE_CONFIG_JIT @@ -480,6 +486,7 @@ pcrs_job *pcrs_free_job(pcrs_job *job) free(job->hints); #endif } +#endif if (job->substitute != NULL) { if (job->substitute->text != NULL) free(job->substitute->text); @@ -626,10 +633,14 @@ pcrs_job *pcrs_compile_command(const char *command, int *errptr) pcrs_job *pcrs_compile(const char *pattern, const char *substitute, const char *options, int *errptr) { pcrs_job *newjob; - int flags; + unsigned int flags; int capturecount; - const char *error; +#ifdef HAVE_PCRE2 + int ret; +#else int pcre_study_options = 0; + const char *error; +#endif *errptr = 0; @@ -661,25 +672,43 @@ pcrs_job *pcrs_compile(const char *pattern, const char *substitute, const char * /* * Compile the pattern */ +#ifdef HAVE_PCRE2 + PCRE2_SIZE error_offset; + newjob->pattern = pcre2_compile((const unsigned char *)pattern, + PCRE2_ZERO_TERMINATED, (unsigned)newjob->options, errptr, + &error_offset, NULL); +#else newjob->pattern = pcre_compile(pattern, newjob->options, &error, errptr, NULL); +#endif if (newjob->pattern == NULL) { pcrs_free_job(newjob); return NULL; } - -#ifdef PCRE_STUDY_JIT_COMPILE +#if defined(PCRE_STUDY_JIT_COMPILE) || defined(HAVE_PCRE2) #ifdef DISABLE_PCRE_JIT_COMPILATION #warning PCRE_STUDY_JIT_COMPILE is supported but Privoxy has been configured not to use it #else if (!(flags & PCRS_DYNAMIC)) { +#ifdef HAVE_PCRE2 + /* Try to enable JIT compilation but continue if it's unsupported. 
*/ + if ((ret = pcre2_jit_compile(newjob->pattern, PCRE2_JIT_COMPLETE)) && + (ret != PCRE2_ERROR_JIT_BADOPTION)) + { + *errptr = ret; + pcrs_free_job(newjob); + return NULL; + } +#else pcre_study_options = PCRE_STUDY_JIT_COMPILE; +#endif } #endif #endif +#ifndef HAVE_PCRE2 /* * Generate hints. This has little overhead, since the * hints will be NULL for a boring pattern anyway. @@ -691,13 +720,17 @@ pcrs_job *pcrs_compile(const char *pattern, const char *substitute, const char * pcrs_free_job(newjob); return NULL; } - +#endif /* * Determine the number of capturing subpatterns. * This is needed for handling $+ in the substitute. */ +#ifdef HAVE_PCRE2 + if (0 > (*errptr = pcre2_pattern_info(newjob->pattern, PCRE2_INFO_CAPTURECOUNT, &capturecount))) +#else if (0 > (*errptr = pcre_fullinfo(newjob->pattern, newjob->hints, PCRE_INFO_CAPTURECOUNT, &capturecount))) +#endif { pcrs_free_job(newjob); return NULL; @@ -809,14 +842,20 @@ int pcrs_execute_list(pcrs_job *joblist, char *subject, size_t subject_length, c *********************************************************************/ int pcrs_execute(pcrs_job *job, const char *subject, size_t subject_length, char **result, size_t *result_length) { - int offsets[3 * PCRS_MAX_SUBMATCHES], - offset, + int offset, i, k, matches_found, submatches, max_matches = PCRS_MAX_MATCH_INIT; size_t newsize; +#ifdef HAVE_PCRE2 + pcrs_match *matches, *dummy; + pcre2_match_data *pcre2_matches; + size_t *offsets; +#else pcrs_match *matches, *dummy; + int offsets[3 * PCRS_MAX_SUBMATCHES]; +#endif char *result_offset; offset = i = 0; @@ -830,27 +869,38 @@ int pcrs_execute(pcrs_job *job, const char *subject, size_t subject_length, char return(PCRS_ERR_BADJOB); } +#ifdef HAVE_PCRE2 + if (NULL == (pcre2_matches = pcre2_match_data_create_from_pattern(job->pattern, NULL))) + { + return(PCRS_ERR_NOMEM); + } + offsets = pcre2_get_ovector_pointer(pcre2_matches); +#endif if (NULL == (matches = (pcrs_match *)malloc((size_t)max_matches * 
sizeof(pcrs_match)))) { return(PCRS_ERR_NOMEM); } memset(matches, '\0', (size_t)max_matches * sizeof(pcrs_match)); - /* * Find the pattern and calculate the space * requirements for the result */ newsize = subject_length; +#ifdef HAVE_PCRE2 + while ((submatches = pcre2_match(job->pattern, (const unsigned char *)subject, + subject_length, (size_t)offset, 0, pcre2_matches, NULL)) > 0) +#else while ((submatches = pcre_exec(job->pattern, job->hints, subject, (int)subject_length, offset, 0, offsets, 3 * PCRS_MAX_SUBMATCHES)) > 0) +#endif { job->flags |= PCRS_SUCCESS; matches[i].submatches = submatches; for (k = 0; k < submatches; k++) { - matches[i].submatch_offset[k] = offsets[2 * k]; + matches[i].submatch_offset[k] = (int)offsets[2 * k]; /* Note: Non-found optional submatches have length -1-(-1)==0 */ matches[i].submatch_length[k] = (size_t)(offsets[2 * k + 1] - offsets[2 * k]); @@ -867,7 +917,7 @@ int pcrs_execute(pcrs_job *job, const char *subject, size_t subject_length, char newsize += (size_t)offsets[0] * (size_t)job->substitute->backref_count[PCRS_MAX_SUBMATCHES]; /* chunk after match */ - matches[i].submatch_offset[PCRS_MAX_SUBMATCHES + 1] = offsets[1]; + matches[i].submatch_offset[PCRS_MAX_SUBMATCHES + 1] = (int)offsets[1]; matches[i].submatch_length[PCRS_MAX_SUBMATCHES + 1] = subject_length - (size_t)offsets[1] - 1; newsize += (subject_length - (size_t)offsets[1]) * (size_t)job->substitute->backref_count[PCRS_MAX_SUBMATCHES + 1]; @@ -894,12 +944,19 @@ int pcrs_execute(pcrs_job *job, const char *subject, size_t subject_length, char break; /* Go find the next one */ else - offset = offsets[1]; + offset = (int)offsets[1]; } /* Pass pcre error through if (bad) failure */ +#ifdef HAVE_PCRE2 + if (submatches < PCRE2_ERROR_NOMATCH) +#else if (submatches < PCRE_ERROR_NOMATCH) +#endif { free(matches); +#ifdef HAVE_PCRE2 + pcre2_match_data_free(pcre2_matches); +#endif return submatches; } matches_found = i; @@ -909,9 +966,19 @@ int pcrs_execute(pcrs_job *job, const char 
*subject, size_t subject_length, char * Get memory for the result (must be freed by caller!) * and append terminating null byte. */ - if ((*result = (char *)malloc(newsize + 1)) == NULL) + if ((*result = (char *)malloc(newsize + 1 +#ifdef HAVE_PCRE2 + /* + * Work around to prevent invalid reads in the jit code. + */ + + 16 +#endif + )) == NULL) { free(matches); +#ifdef HAVE_PCRE2 + pcre2_match_data_free(pcre2_matches); +#endif return PCRS_ERR_NOMEM; } else @@ -964,6 +1031,9 @@ int pcrs_execute(pcrs_job *job, const char *subject, size_t subject_length, char memcpy(result_offset, subject + offset, subject_length - (size_t)offset); *result_length = newsize; +#ifdef HAVE_PCRE2 + pcre2_match_data_free(pcre2_matches); +#endif free(matches); return matches_found; @@ -1101,7 +1171,7 @@ char pcrs_get_delimiter(const char *string) *********************************************************************/ char *pcrs_execute_single_command(const char *subject, const char *pcrs_command, int *hits) { - size_t size; + size_t buffer_size, new_size; char *result = NULL; pcrs_job *job; @@ -1109,12 +1179,14 @@ char *pcrs_execute_single_command(const char *subject, const char *pcrs_command, assert(pcrs_command); *hits = 0; - size = strlen(subject); + buffer_size = strlen(subject); job = pcrs_compile_command(pcrs_command, hits); if (NULL != job) { - *hits = pcrs_execute(job, subject, size, &result, &size); + *hits = pcrs_execute(job, subject, buffer_size, &result, &new_size); + buffer_size = new_size; + if (*hits < 0) { freez(result); diff --git a/pcrs.h b/pcrs.h index abff3caa..32cf37c2 100644 --- a/pcrs.h +++ b/pcrs.h @@ -33,9 +33,18 @@ *********************************************************************/ +#ifdef HAVE_PCRE2 +#define PCRE2_CODE_UNIT_WIDTH 8 +#define PCREn(x) PCRE2_ ## x +#ifndef _PCRE2_H +#include <pcre2.h> +#endif +#else +#define PCREn(x) PCRE_ ## x #ifndef _PCRE_H #include <pcre.h> #endif +#endif /* * Constants: @@ -55,22 +64,23 @@ * They are supposed to be handled together with PCRE 
error * codes and have to start with an offset to prevent overlaps. * - * PCRE 6.7 uses error codes from -1 to -21, PCRS error codes - * below -100 should be safe for a while. + * PCRE 6.7 uses error codes from -1 to -21, + * PCRE2 10.42 uses error codes from -66 to 101. + * PCRS error codes below -300 should be safe for a while. */ -#define PCRS_ERR_NOMEM -100 /* Failed to acquire memory. */ -#define PCRS_ERR_CMDSYNTAX -101 /* Syntax of s///-command */ -#define PCRS_ERR_STUDY -102 /* pcre error while studying the pattern */ -#define PCRS_ERR_BADJOB -103 /* NULL job pointer, pattern or substitute */ -#define PCRS_WARN_BADREF -104 /* Backreference out of range */ -#define PCRS_WARN_TRUNCATION -105 /* At least one pcrs variable was too big, +#define PCRS_ERR_NOMEM -300 /* Failed to acquire memory. */ +#define PCRS_ERR_CMDSYNTAX -301 /* Syntax of s///-command */ +#define PCRS_ERR_STUDY -302 /* pcre error while studying the pattern */ +#define PCRS_ERR_BADJOB -303 /* NULL job pointer, pattern or substitute */ +#define PCRS_WARN_BADREF -304 /* Backreference out of range */ +#define PCRS_WARN_TRUNCATION -305 /* At least one pcrs variable was too big, * only the first part was used. 
*/ /* Flags */ -#define PCRS_GLOBAL 1 /* Job should be applied globally, as with perl's g option */ -#define PCRS_TRIVIAL 2 /* Backreferences in the substitute are ignored */ -#define PCRS_SUCCESS 4 /* Job did previously match */ -#define PCRS_DYNAMIC 8 /* Job is dynamic (used to disable JIT compilation) */ +#define PCRS_GLOBAL 0x08000000u /* Job should be applied globally, as with perl's g option */ +#define PCRS_TRIVIAL 0x10000000u /* Backreferences in the substitute are ignored */ +#define PCRS_SUCCESS 0x20000000u /* Job did previously match */ +#define PCRS_DYNAMIC 0x40000000u /* Job is dynamic (used to disable JIT compilation) */ /* @@ -107,10 +117,14 @@ typedef struct { /* A PCRS job */ typedef struct PCRS_JOB { +#ifdef HAVE_PCRE2 + pcre2_code *pattern; +#else pcre *pattern; /* The compiled pcre pattern */ pcre_extra *hints; /* The pcre hints for the pattern */ +#endif int options; /* The pcre options (numeric) */ - int flags; /* The pcrs and user flags (see "Flags" above) */ + unsigned int flags; /* The pcrs and user flags (see "Flags" above) */ pcrs_substitute *substitute; /* The compiled pcrs substitute */ struct PCRS_JOB *next; /* Pointer for chaining jobs to joblists */ } pcrs_job; diff --git a/project.h b/project.h index e8bb6788..b203ad33 100644 --- a/project.h +++ b/project.h @@ -94,12 +94,38 @@ */ #ifdef STATIC_PCRE +#ifdef HAVE_PCRE2 +# include "pcre2.h" +# include "pcre2posix.h" +#else # include "pcre.h" +# include "pcreposix.h" +#endif #else -# ifdef PCRE_H_IN_SUBDIR -# include <pcre/pcre.h> +# ifdef HAVE_PCRE2 +# ifdef PCRE2_H_IN_SUBDIR +# define PCRE2_CODE_UNIT_WIDTH 8 +# include <pcre2/pcre2.h> +# else +# define PCRE2_CODE_UNIT_WIDTH 8 +# include <pcre2.h> +# endif +# ifdef PCRE2POSIX_H_IN_SUBDIR +# include <pcre2/pcre2posix.h> +# else +# include <pcre2posix.h> +# endif # else -# include <pcre.h> +# ifdef PCRE_H_IN_SUBDIR +# include <pcre/pcre.h> +# else +# include <pcre.h> +# endif +# ifdef PCREPOSIX_H_IN_SUBDIR +# include <pcre/pcreposix.h> +# else +# include <pcreposix.h> +# endif # endif #endif @@ -109,16 +135,6 @@ # include <pcrs.h> #endif -#ifdef STATIC_PCRE -# include "pcreposix.h" 
-#else -# ifdef PCRE_H_IN_SUBDIR -# include <pcre/pcreposix.h> -# else -# include <pcreposix.h> -# endif -#endif - #ifdef _WIN32 /* * I don't want to have to #include all this just for the declaration @@ -404,10 +420,16 @@ struct http_response enum crunch_reason crunch_reason; /**< Why the response was generated in the first place. */ }; +#ifdef HAVE_PCRE2 +#define REGEX_TYPE pcre2_code +#else +#define REGEX_TYPE regex_t +#endif + struct url_spec { #ifdef FEATURE_PCRE_HOST_PATTERNS - regex_t *host_regex;/**< Regex for host matching */ + REGEX_TYPE *host_regex;/**< Regex for host matching */ enum host_regex_type { VANILLA_HOST_PATTERN, PCRE_HOST_PATTERN } host_regex_type; #endif /* defined FEATURE_PCRE_HOST_PATTERNS */ int dcount; /**< How many parts to this domain? (length of dvec) */ @@ -417,7 +439,7 @@ struct url_spec char *port_list; /**< List of acceptable ports, or NULL to match all ports */ - regex_t *preg; /**< Regex for matching path part */ + REGEX_TYPE *preg; /**< Regex for matching path part */ }; /** @@ -432,7 +454,7 @@ struct pattern_spec union { struct url_spec url_spec; - regex_t *tag_regex; + REGEX_TYPE *tag_regex; } pattern; unsigned int flags; /**< Bitmap with various pattern properties. */ diff --git a/templates/show-status b/templates/show-status index 896c86d2..a6aaf8bf 100644 --- a/templates/show-status +++ b/templates/show-status @@ -298,10 +298,7 @@ FEATURE_DYNAMIC_PCRE @if-FEATURE_DYNAMIC_PCRE-then@ Yes @else-not-FEATURE_DYNAMIC_PCRE@ No @endif-FEATURE_DYNAMIC_PCRE@ - Dynamically link to the PCRE library. This is set automatically - by ./configure if you do not have libpcre installed. - Dynamically linking to an external libpcre is recommended as the one that is distributed - with Privoxy itself is outdated and lacks various features and bug-fixes you may be interested in. + Dynamically link to the PCRE(2) library (recommended). 
FEATURE_EXTENDED_STATISTICS diff --git a/urlmatch.c b/urlmatch.c index 1e10c077..84e9d298 100644 --- a/urlmatch.c +++ b/urlmatch.c @@ -604,6 +604,100 @@ jb_err parse_http_request(const char *req, struct http_request *http) } +#ifdef HAVE_PCRE2 +/********************************************************************* + * + * Function : compile_pattern + * + * Description : Compiles a host, domain or TAG pattern. + * + * Parameters : + * 1 : pattern = The pattern to compile. + * 2 : anchoring = How the regex should be modified + * before compilation. Can be either + * one of NO_ANCHORING, LEFT_ANCHORED, + * RIGHT_ANCHORED or RIGHT_ANCHORED_HOST. + * 3 : url = In case of failures, the spec member is + * logged and the structure freed. + * 4 : regex = Where the compiled regex should be stored. + * + * Returns : JB_ERR_OK - Success + * JB_ERR_PARSE - Cannot parse regex + * + *********************************************************************/ +static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring, + struct pattern_spec *url, pcre2_code **regex) +{ + int errcode; + const char *fmt = NULL; + char *rebuf; + size_t rebuf_size; + PCRE2_SIZE error_offset; + int ret; + + assert(pattern); + + if (pattern[0] == '\0') + { + *regex = NULL; + return JB_ERR_OK; + } + + switch (anchoring) + { + case NO_ANCHORING: + fmt = "%s"; + break; + case RIGHT_ANCHORED: + fmt = "%s$"; + break; + case RIGHT_ANCHORED_HOST: + fmt = "%s\\.?$"; + break; + case LEFT_ANCHORED: + fmt = "^%s"; + break; + default: + log_error(LOG_LEVEL_FATAL, + "Invalid anchoring in compile_pattern %d", anchoring); + } + rebuf_size = strlen(pattern) + strlen(fmt); + rebuf = malloc_or_die(rebuf_size); + + snprintf(rebuf, rebuf_size, fmt, pattern); + + *regex = pcre2_compile((const unsigned char *)pattern, + PCRE2_ZERO_TERMINATED, PCRE2_CASELESS, &errcode, + &error_offset, NULL); + if (*regex == NULL) + { + log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s", + pattern, url->spec, rebuf); 
+ freez(rebuf); + + return JB_ERR_PARSE; + } + +#ifndef DISABLE_PCRE_JIT_COMPILATION + /* Try to enable JIT compilation but continue if it's unsupported. */ + if ((ret = pcre2_jit_compile(*regex, PCRE2_JIT_COMPLETE)) && + (ret != PCRE2_ERROR_JIT_BADOPTION)) + { + log_error(LOG_LEVEL_ERROR, + "Unexpected error enabling JIT compilation for %s from %s: %s", + pattern, url->spec, rebuf); + freez(rebuf); + + return JB_ERR_PARSE; + } +#endif + + freez(rebuf); + + return JB_ERR_OK; + +} +#else /********************************************************************* * * Function : compile_pattern @@ -686,6 +780,7 @@ static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchorin return JB_ERR_OK; } +#endif /********************************************************************* @@ -1051,6 +1146,49 @@ static int simplematch(const char *pattern, const char *text) } +#ifdef HAVE_PCRE2 +/********************************************************************* + * + * Function : pcre2_pattern_matches + * + * Description : Checks if a compiled pcre2 pattern matches a string. + * + * Parameters : + * 1 : pattern = The compiled pattern + * 2 : string = The string to check + * + * Returns : TRUE for yes, FALSE otherwise. 
+ * + *********************************************************************/ +int pcre2_pattern_matches(const pcre2_code *pattern, const char *string) +{ + PCRE2_SIZE offset; + int ret; + pcre2_match_data *pcre2_matches; + + assert(pattern != NULL); + assert(string != NULL); + + offset = 0; + + pcre2_matches = pcre2_match_data_create_from_pattern(pattern, NULL); + if (NULL == pcre2_matches) + { + log_error(LOG_LEVEL_ERROR, + "Out of memory while matching pattern against %s", string); + return FALSE; + } + + ret = pcre2_match(pattern, (const unsigned char *)string, strlen(string), + offset, 0, pcre2_matches, NULL); + + pcre2_match_data_free(pcre2_matches); + + return (ret >= 0); +} +#endif + + /********************************************************************* * * Function : simple_domaincmp @@ -1268,8 +1406,12 @@ void free_pattern_spec(struct pattern_spec *pattern) { if (pattern->pattern.tag_regex) { +#ifdef HAVE_PCRE2 + pcre2_code_free(pattern->pattern.tag_regex); +#else regfree(pattern->pattern.tag_regex); freez(pattern->pattern.tag_regex); +#endif } return; } @@ -1277,8 +1419,12 @@ void free_pattern_spec(struct pattern_spec *pattern) #ifdef FEATURE_PCRE_HOST_PATTERNS if (pattern->pattern.url_spec.host_regex) { +#ifdef HAVE_PCRE2 + pcre2_code_free(pattern->pattern.url_spec.host_regex); +#else regfree(pattern->pattern.url_spec.host_regex); freez(pattern->pattern.url_spec.host_regex); +#endif } #endif /* def FEATURE_PCRE_HOST_PATTERNS */ freez(pattern->pattern.url_spec.dbuffer); @@ -1287,8 +1433,12 @@ void free_pattern_spec(struct pattern_spec *pattern) freez(pattern->pattern.url_spec.port_list); if (pattern->pattern.url_spec.preg) { +#ifdef HAVE_PCRE2 + pcre2_code_free(pattern->pattern.url_spec.preg); +#else regfree(pattern->pattern.url_spec.preg); freez(pattern->pattern.url_spec.preg); +#endif } } @@ -1333,8 +1483,13 @@ static int host_matches(const struct http_request *http, if (pattern->pattern.url_spec.host_regex_type == PCRE_HOST_PATTERN) { return ((NULL 
== pattern->pattern.url_spec.host_regex) +#ifdef HAVE_PCRE2 + || pcre2_pattern_matches(pattern->pattern.url_spec.host_regex, + http->host)); +#else || (0 == regexec(pattern->pattern.url_spec.host_regex, http->host, 0, NULL, 0))); +#endif } #endif return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http))); @@ -1357,7 +1512,11 @@ static int host_matches(const struct http_request *http, static int path_matches(const char *path, const struct pattern_spec *pattern) { return ((NULL == pattern->pattern.url_spec.preg) +#ifdef HAVE_PCRE2 + || (pcre2_pattern_matches(pattern->pattern.url_spec.preg, path))); +#else || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0))); +#endif } diff --git a/urlmatch.h b/urlmatch.h index 315e8b24..8643aa4e 100644 --- a/urlmatch.h +++ b/urlmatch.h @@ -50,6 +50,10 @@ extern int url_requires_percent_encoding(const char *url); extern int url_match(const struct pattern_spec *pattern, const struct http_request *http); +#ifdef HAVE_PCRE2 +extern int pcre2_pattern_matches(const pcre2_code *pattern, const char *string); +#endif + extern jb_err create_pattern_spec(struct pattern_spec *url, char *buf); extern void free_pattern_spec(struct pattern_spec *url); extern int match_portlist(const char *portlist, int port); diff --git a/w32log.c b/w32log.c index c7e3540d..7500edaf 100644 --- a/w32log.c +++ b/w32log.c @@ -316,6 +316,9 @@ void TermLogWindow(void) void LogCreatePatternMatchingBuffers(void) { int i; +#ifdef HAVE_PCRE2 +#warning The win32 build of Privoxy is expected to crash when compiled with pcre2 support. +#endif for (i = 0; patterns_to_highlight[i].str != NULL; i++) { regcomp(&patterns_to_highlight[i].buffer, patterns_to_highlight[i].str, REG_ICASE);