1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
10 Written by: Philip Hazel <ph10@cam.ac.uk>
12 Copyright (c) 1997-2000 University of Cambridge
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
36 /* Define DEBUG to get debugging output on stdout. */
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
45 #define DPRINTF(p) printf p
47 #define DPRINTF(p) /*nothing*/
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
56 /* Allow compilation as C++ source code, should anybody want to do that. */
59 #define class pcre_class
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
66 #define BRASTACK_SIZE 200
69 /* The number of bytes in a literal character string above which we can't add
70 any more is different when UTF-8 characters may be encountered. */
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
84 /* Text forms of OP_ values and things, for debugging (not all used) */
87 static const char *OP_names[] = {
88 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
89 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
90 "Opt", "^", "$", "Any", "chars", "not",
91 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
92 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
93 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
94 "*", "*?", "+", "+?", "?", "??", "{", "{",
95 "class", "Ref", "Recurse",
96 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
97 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
98 "Brazero", "Braminzero", "Bra"
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
107 static const short int escapes[] = {
108 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
111 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
115 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
116 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
117 0, 0, -ESC_z /* x - z */
120 /* Tables of names of POSIX character classes and their lengths. The list is
121 terminated by a zero length entry. The first three must be alpha, upper, lower,
122 as this is assumed for handling case independence. */
124 static const char *posix_names[] = {
125 "alpha", "lower", "upper",
126 "alnum", "ascii", "cntrl", "digit", "graph",
127 "print", "punct", "space", "word", "xdigit" };
129 static const uschar posix_name_lengths[] = {
130 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
132 /* Table of class bit maps for each POSIX class; up to three may be combined
133 to form the class. */
135 static const int posix_class_maps[] = {
136 cbit_lower, cbit_upper, -1, /* alpha */
137 cbit_lower, -1, -1, /* lower */
138 cbit_upper, -1, -1, /* upper */
139 cbit_digit, cbit_lower, cbit_upper, /* alnum */
140 cbit_print, cbit_cntrl, -1, /* ascii */
141 cbit_cntrl, -1, -1, /* cntrl */
142 cbit_digit, -1, -1, /* digit */
143 cbit_graph, -1, -1, /* graph */
144 cbit_print, -1, -1, /* print */
145 cbit_punct, -1, -1, /* punct */
146 cbit_space, -1, -1, /* space */
147 cbit_word, -1, -1, /* word */
148 cbit_xdigit,-1, -1 /* xdigit */
152 /* Definition to allow mutual recursion */
155 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156 BOOL, int, int *, int *, compile_data *);
158 /* Structure for building a chain of data that actually lives on the
159 stack, for holding the values of the subject pointer at the start of each
160 subpattern, so as to detect when an empty string has been matched by a
161 subpattern - to break infinite loops. */
163 typedef struct eptrblock {
164 struct eptrblock *prev;
165 const uschar *saved_eptr;
168 /* Flag bits for the match() function */
170 #define match_condassert 0x01 /* Called to check a condition assertion */
171 #define match_isgroup 0x02 /* Set if start of bracketed group */
175 /*************************************************
177 *************************************************/
179 /* PCRE is thread-clean and doesn't use any global variables in the normal
180 sense. However, it calls memory allocation and free functions via the two
181 indirections below, which are can be changed by the caller, but are shared
182 between all threads. */
184 void *(*pcre_malloc)(size_t) = malloc;
185 void (*pcre_free)(void *) = free;
189 /*************************************************
190 * Macros and tables for character handling *
191 *************************************************/
193 /* When UTF-8 encoding is being used, a character is no longer just a single
194 byte. The macros for character handling generate simple sequences when used in
195 byte-mode, and more complicated ones for UTF-8 characters. */
198 #define GETCHARINC(c, eptr) c = *eptr++;
199 #define GETCHARLEN(c, eptr, len) c = *eptr;
200 #define BACKCHAR(eptr)
202 #else /* SUPPORT_UTF8 */
204 /* Get the next UTF-8 character, advancing the pointer */
206 #define GETCHARINC(c, eptr) \
208 if (md->utf8 && (c & 0xc0) == 0xc0) \
210 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
211 int s = 6 - a; /* Amount to shift next byte */ \
212 c &= utf8_table3[a]; /* Low order bits from first byte */ \
215 c |= (*eptr++ & 0x3f) << s; \
220 /* Get the next UTF-8 character, not advancing the pointer, setting length */
222 #define GETCHARLEN(c, eptr, len) \
225 if (md->utf8 && (c & 0xc0) == 0xc0) \
228 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
229 int s = 6 - a; /* Amount to shift next byte */ \
230 c &= utf8_table3[a]; /* Low order bits from first byte */ \
231 for (i = 1; i <= a; i++) \
233 c |= (eptr[i] & 0x3f) << s; \
239 /* If the pointer is not at the start of a character, move it back until
242 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
248 /*************************************************
249 * Default character tables *
250 *************************************************/
252 /* A default set of character tables is included in the PCRE binary. Its source
253 is built by the maketables auxiliary program, which uses the default C ctypes
254 functions, and put in the file chartables.c. These tables are used by PCRE
255 whenever the caller of pcre_compile() does not provide an alternate set of
258 #include "chartables.c"
263 /*************************************************
264 * Tables for UTF-8 support *
265 *************************************************/
267 /* These are the breakpoints for different numbers of bytes in a UTF-8
270 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
272 /* These are the indicator bits and the mask for the data bits to set in the
273 first byte of a character, indexed by the number of additional bytes. */
275 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
278 /* Table of the number of extra characters, indexed by the first character
279 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
282 static uschar utf8_table4[] = {
283 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
289 /*************************************************
290 * Convert character value to UTF-8 *
291 *************************************************/
293 /* This function takes an integer value in the range 0 - 0x7fffffff
294 and encodes it as a UTF-8 character in 0 to 6 bytes.
297 cvalue the character value
298 buffer pointer to buffer for result - at least 6 bytes long
300 Returns: number of characters placed in the buffer
304 ord2utf8(int cvalue, uschar *buffer)
307 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308 if (cvalue <= utf8_table1[i]) break;
309 *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
311 for (j = 0; j < i; j++)
313 *buffer++ = 0x80 | (cvalue & 0x3f);
322 /*************************************************
323 * Return version string *
324 *************************************************/
326 #define STRING(a) # a
327 #define XSTRING(s) STRING(s)
332 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
338 /*************************************************
339 * (Obsolete) Return info about compiled pattern *
340 *************************************************/
342 /* This is the original "info" function. It picks potentially useful data out
343 of the private structure, but its interface was too rigid. It remains for
344 backwards compatibility. The public options are passed back in an int - though
345 the re->options field has been expanded to a long int, all the public options
346 at the low end of it, and so even on 16-bit systems this will still be OK.
347 Therefore, I haven't changed the API for pcre_info().
350 external_re points to compiled code
351 optptr where to pass back the options
352 first_char where to pass back the first character,
353 or -1 if multiline and all branches start ^,
356 Returns: number of capturing subpatterns
357 or negative values on error
361 pcre_info(const pcre *external_re, int *optptr, int *first_char)
363 const real_pcre *re = (const real_pcre *)external_re;
364 if (re == NULL) return PCRE_ERROR_NULL;
365 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367 if (first_char != NULL)
368 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
370 return re->top_bracket;
375 /*************************************************
376 * Return info about compiled pattern *
377 *************************************************/
379 /* This is a newer "info" function which has an extensible interface so
380 that additional items can be added compatibly.
383 external_re points to compiled code
384 external_study points to study data, or NULL
385 what what information is required
386 where where to put the information
388 Returns: 0 if data returned, negative on error
392 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
395 const real_pcre *re = (const real_pcre *)external_re;
396 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
398 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
403 case PCRE_INFO_OPTIONS:
404 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
408 *((size_t *)where) = re->size;
411 case PCRE_INFO_CAPTURECOUNT:
412 *((int *)where) = re->top_bracket;
415 case PCRE_INFO_BACKREFMAX:
416 *((int *)where) = re->top_backref;
419 case PCRE_INFO_FIRSTCHAR:
421 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
425 case PCRE_INFO_FIRSTTABLE:
426 *((const uschar **)where) =
427 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428 study->start_bits : NULL;
431 case PCRE_INFO_LASTLITERAL:
433 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
436 default: return PCRE_ERROR_BADOPTION;
445 /*************************************************
446 * Debugging function to print chars *
447 *************************************************/
449 /* Print a sequence of chars in printable format, stopping at the end of the
450 subject if the requested.
453 p points to characters
454 length number to print
455 is_subject TRUE if printing from within md->start_subject
456 md pointer to matching data block, if is_subject is TRUE
462 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
465 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
467 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
474 /*************************************************
476 *************************************************/
478 /* This function is called when a \ has been encountered. It either returns a
479 positive value for a simple escape such as \n, or a negative value which
480 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481 a positive value greater than 255 may be returned. On entry, ptr is pointing at
482 the \. On exit, it is on the final character of the escape sequence.
485 ptrptr points to the pattern position pointer
486 errorptr points to the pointer to the error message
487 bracount number of previous extracting brackets
488 options the options bits
489 isclass TRUE if inside a character class
490 cd pointer to char tables block
492 Returns: zero or positive => a data character
493 negative => a special escape sequence
494 on error, errorptr is set
498 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
499 int options, BOOL isclass, compile_data *cd)
501 const uschar *ptr = *ptrptr;
504 /* If backslash is at the end of the pattern, it's an error. */
507 if (c == 0) *errorptr = ERR1;
509 /* Digits or letters may have special meaning; all others are literals. */
511 else if (c < '0' || c > 'z') {}
513 /* Do an initial lookup in a table. A non-zero result is something that can be
514 returned immediately. Otherwise further processing may be required. */
516 else if ((i = escapes[c - '0']) != 0) c = i;
518 /* Escapes that need further processing, or are illegal. */
522 const uschar *oldptr;
525 /* The handling of escape sequences consisting of a string of digits
526 starting with one that is not zero is not straightforward. By experiment,
527 the way Perl works seems to be as follows:
529 Outside a character class, the digits are read as a decimal number. If the
530 number is less than 10, or if there are that many previous extracting
531 left brackets, then it is a back reference. Otherwise, up to three octal
532 digits are read to form an escaped byte. Thus \123 is likely to be octal
533 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
534 value is greater than 377, the least significant 8 bits are taken. Inside a
535 character class, \ followed by a digit is always an octal number. */
537 case '1': case '2': case '3': case '4': case '5':
538 case '6': case '7': case '8': case '9':
544 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
545 c = c * 10 + *(++ptr) - '0';
546 if (c < 10 || c <= bracount)
551 ptr = oldptr; /* Put the pointer back and fall through */
554 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
555 generates a binary zero byte and treats the digit as a following literal.
556 Thus we have to pull back the pointer by one. */
558 if ((c = *ptr) >= '8')
565 /* \0 always starts an octal number, but we may drop through to here with a
566 larger first octal digit. */
570 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571 ptr[1] != '8' && ptr[1] != '9')
572 c = c * 8 + *(++ptr) - '0';
573 c &= 255; /* Take least significant 8 bits */
576 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577 which can be greater than 0xff, but only if the ddd are hex digits. */
581 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
583 const uschar *pt = ptr + 2;
584 register int count = 0;
586 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
589 c = c * 16 + cd->lcc[*pt] -
590 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
595 if (c < 0 || count > 8) *errorptr = ERR34;
599 /* If the sequence of hex digits does not end with '}', then we don't
600 recognize this construct; fall through to the normal \x handling. */
604 /* Read just a single hex char */
607 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
610 c = c * 16 + cd->lcc[*ptr] -
611 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
615 /* Other special escapes not starting with a digit are straightforward */
625 /* A letter is upper-cased; then the 0x40 bit is flipped */
627 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
631 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
632 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
633 for Perl compatibility, it is a literal. This code looks a bit odd, but
634 there used to be some cases other than the default, and there may be again
635 in future, so I haven't "optimized" it. */
638 if ((options & PCRE_EXTRA) != 0) switch(c)
654 /*************************************************
655 * Check for counted repeat *
656 *************************************************/
658 /* This function is called when a '{' is encountered in a place where it might
659 start a quantifier. It looks ahead to see if it really is a quantifier or not.
660 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
661 where the ddds are digits.
664 p pointer to the first char after '{'
665 cd pointer to char tables block
667 Returns: TRUE or FALSE
671 is_counted_repeat(const uschar *p, compile_data *cd)
673 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
674 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
675 if (*p == '}') return TRUE;
677 if (*p++ != ',') return FALSE;
678 if (*p == '}') return TRUE;
680 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
681 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
687 /*************************************************
688 * Read repeat counts *
689 *************************************************/
691 /* Read an item of the form {n,m} and return the values. This is called only
692 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
693 so the syntax is guaranteed to be correct, but we need to check the values.
696 p pointer to first char after '{'
697 minp pointer to int for min
698 maxp pointer to int for max
699 returned as -1 if no max
700 errorptr points to pointer to error message
701 cd pointer to character tables clock
703 Returns: pointer to '}' on success;
704 current ptr on error, with errorptr set
707 static const uschar *
708 read_repeat_counts(const uschar *p, int *minp, int *maxp,
709 const char **errorptr, compile_data *cd)
714 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
716 if (*p == '}') max = min; else
721 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
730 /* Do paranoid checks, then fill in the required variables, and pass back the
731 pointer to the terminating '}'. */
733 if (min > 65535 || max > 65535)
745 /*************************************************
746 * Find the fixed length of a pattern *
747 *************************************************/
749 /* Scan a pattern and compute the fixed length of subject that will match it,
750 if the length is fixed. This is needed for dealing with backward assertions.
753 code points to the start of the pattern (the bracket)
754 options the compiling options
756 Returns: the fixed length, or -1 if there is no fixed length
760 find_fixedlength(uschar *code, int options)
764 register int branchlength = 0;
765 register uschar *cc = code + 3;
767 /* Scan along the opcodes for this branch. If we get to the end of the
768 branch, check the length against that of the other branches. */
773 register int op = *cc;
774 if (op >= OP_BRA) op = OP_BRA;
781 d = find_fixedlength(cc, options);
782 if (d < 0) return -1;
784 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
788 /* Reached end of a branch; if it's a ket it is the end of a nested
789 call. If it's ALT it is an alternation in a nested call. If it is
790 END it's the end of the outer call. All can be handled by the same code. */
797 if (length < 0) length = branchlength;
798 else if (length != branchlength) return -1;
799 if (*cc != OP_ALT) return length;
804 /* Skip over assertive subpatterns */
809 case OP_ASSERTBACK_NOT:
810 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
814 /* Skip over things that don't match chars */
830 case OP_NOT_WORD_BOUNDARY:
831 case OP_WORD_BOUNDARY:
835 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836 This requires a scan of the string, unfortunately. We assume valid UTF-8
837 strings, so all we do is reduce the length by one for byte whose bits are
841 branchlength += *(++cc);
843 for (d = 1; d <= *cc; d++)
844 if ((cc[d] & 0xc0) == 0x80) branchlength--;
849 /* Handle exact repetitions */
853 branchlength += (cc[1] << 8) + cc[2];
857 /* Handle single-char matchers */
861 case OP_NOT_WHITESPACE:
863 case OP_NOT_WORDCHAR:
871 /* Check a class for variable quantification */
874 cc += (*cc == OP_REF)? 2 : 33;
886 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
887 branchlength += (cc[1] << 8) + cc[2];
896 /* Anything else is variable length */
902 /* Control never gets here */
908 /*************************************************
909 * Check for POSIX class syntax *
910 *************************************************/
912 /* This function is called when the sequence "[:" or "[." or "[=" is
913 encountered in a character class. It checks whether this is followed by an
914 optional ^ and then a sequence of letters, terminated by a matching ":]" or
918 ptr pointer to the initial [
919 endptr where to return the end pointer
920 cd pointer to compile data
922 Returns: TRUE or FALSE
926 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
928 int terminator; /* Don't combine these lines; the Solaris cc */
929 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
930 if (*(++ptr) == '^') ptr++;
931 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932 if (*ptr == terminator && ptr[1] == ']')
943 /*************************************************
944 * Check POSIX class name *
945 *************************************************/
947 /* This function is called to check the name given in a POSIX-style class entry
951 ptr points to the first letter
952 len the length of the name
954 Returns: a value representing the name, or -1 if unknown
958 check_posix_name(const uschar *ptr, int len)
960 register int yield = 0;
961 while (posix_name_lengths[yield] != 0)
963 if (len == posix_name_lengths[yield] &&
964 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
973 /*************************************************
974 * Compile one branch *
975 *************************************************/
977 /* Scan the pattern, compiling it into the code vector.
980 options the option bits
981 brackets points to number of brackets used
982 code points to the pointer to the current code point
983 ptrptr points to the current pattern pointer
984 errorptr points to pointer to error message
985 optchanged set to the value of the last OP_OPT item compiled
986 reqchar set to the last literal character required, else -1
987 countlits set to count of mandatory literal characters
988 cd contains pointers to tables
990 Returns: TRUE on success
991 FALSE, with *errorptr set on error
995 compile_branch(int options, int *brackets, uschar **codeptr,
996 const uschar **ptrptr, const char **errorptr, int *optchanged,
997 int *reqchar, int *countlits, compile_data *cd)
999 int repeat_type, op_type;
1000 int repeat_min, repeat_max;
1001 int bravalue, length;
1002 int greedy_default, greedy_non_default;
1005 int subcountlits = 0;
1007 register uschar *code = *codeptr;
1009 const uschar *ptr = *ptrptr;
1010 const uschar *tempptr;
1011 uschar *previous = NULL;
1014 /* Set up the default and non-default settings for greediness */
1016 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017 greedy_non_default = greedy_default ^ 1;
1019 /* Initialize no required char, and count of literals */
1021 *reqchar = prevreqchar = -1;
1024 /* Switch on next character until the end of the branch */
1029 int class_charcount;
1036 if ((options & PCRE_EXTENDED) != 0)
1038 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1041 /* The space before the ; is to avoid a warning on a silly compiler
1042 on the Macintosh. */
1043 while ((c = *(++ptr)) != 0 && c != '\n') ;
1050 /* The branch terminates at end of string, |, or ). */
1059 /* Handle single-character metacharacters */
1076 /* Character classes. These always build a 32-byte bitmap of the permitted
1077 characters, except in the special case where there is only one character.
1078 For negated classes, we build the map as usual, then invert it at the end.
1085 /* If the first character is '^', set the negation flag and skip it. */
1087 if ((c = *(++ptr)) == '^')
1089 negate_class = TRUE;
1092 else negate_class = FALSE;
1094 /* Keep a count of chars so that we can optimize the case of just a single
1097 class_charcount = 0;
1098 class_lastchar = -1;
1100 /* Initialize the 32-char bit map to all zeros. We have to build the
1101 map in a temporary bit of store, in case the class contains only 1
1102 character, because in that case the compiled code doesn't use the
1105 memset(class, 0, 32 * sizeof(uschar));
1107 /* Process characters until ] is reached. By writing this as a "do" it
1108 means that an initial ] is taken as a data character. */
1118 /* Handle POSIX class names. Perl allows a negation extension of the
1119 form [:^name]. A square bracket that doesn't match the syntax is
1120 treated as a literal. We also recognize the POSIX constructions
1121 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1125 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126 check_posix_syntax(ptr, &tempptr, cd))
1128 BOOL local_negate = FALSE;
1130 register const uschar *cbits = cd->cbits;
1141 local_negate = TRUE;
1145 posix_class = check_posix_name(ptr, tempptr - ptr);
1146 if (posix_class < 0)
1152 /* If matching is caseless, upper and lower are converted to
1153 alpha. This relies on the fact that the class table starts with
1154 alpha, lower, upper as the first 3 entries. */
1156 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1159 /* Or into the map we are building up to 3 of the static class
1160 tables, or their negations. */
1163 for (i = 0; i < 3; i++)
1165 int taboffset = posix_class_maps[posix_class + i];
1166 if (taboffset < 0) break;
1168 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1170 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1174 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1178 /* Backslash may introduce a single character, or it may introduce one
1179 of the specials, which just set a flag. Escaped items are checked for
1180 validity in the pre-compiling pass. The sequence \b is a special case.
1181 Inside a class (and only there) it is treated as backspace. Elsewhere
1182 it marks a word boundary. Other escapes have preset maps ready to
1183 or into the one we are building. We assume they have more than one
1184 character in them, so set class_count bigger than one. */
1188 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1189 if (-c == ESC_b) c = '\b';
1192 register const uschar *cbits = cd->cbits;
1193 class_charcount = 10;
1197 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1201 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1205 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1209 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1213 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1217 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1226 /* Fall through if single character, but don't at present allow
1227 chars > 255 in UTF-8 mode. */
1238 /* A single character may be followed by '-' to form a range. However,
1239 Perl does not permit ']' to be the end of the range. A '-' character
1240 here is treated as a literal. */
1242 if (ptr[1] == '-' && ptr[2] != ']')
1254 /* The second part of a range can be a single-character escape, but
1255 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256 in such circumstances. */
1260 const uschar *oldptr = ptr;
1261 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1270 /* \b is backslash; any other special means the '-' was literal */
1274 if (d == -ESC_b) d = '\b'; else
1277 goto SINGLE_CHARACTER; /* A few lines below */
1290 class[c/8] |= (1 << (c&7));
1291 if ((options & PCRE_CASELESS) != 0)
1293 int uc = cd->fcc[c]; /* flip case */
1294 class[uc/8] |= (1 << (uc&7));
1296 class_charcount++; /* in case a one-char range */
1299 continue; /* Go get the next char in the class */
1302 /* Handle a lone single character - we can get here for a normal
1303 non-escape char, or after \ that introduces a single character. */
1307 class [c/8] |= (1 << (c&7));
1308 if ((options & PCRE_CASELESS) != 0)
1310 c = cd->fcc[c]; /* flip case */
1311 class[c/8] |= (1 << (c&7));
1317 /* Loop until ']' reached; the check for end of string happens inside the
1318 loop. This "while" is the end of the "do" above. */
1320 while ((c = *(++ptr)) != ']');
1322 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1323 precisely one character. This doesn't need the whole 32-byte bit map.
1324 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1327 if (class_charcount == 1 && class_lastchar >= 0)
1335 code[-1] = OP_CHARS;
1338 *code++ = class_lastchar;
1341 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1347 for (c = 0; c < 32; c++) code[c] = ~class[c];
1349 memcpy(code, class, 32);
1354 /* Various kinds of repeat */
1357 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1358 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1359 if (*errorptr != NULL) goto FAILED;
1377 if (previous == NULL)
1383 /* If the next character is '?' this is a minimizing repeat, by default,
1384 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1388 { repeat_type = greedy_non_default; ptr++; }
1389 else repeat_type = greedy_default;
1391 /* If previous was a string of characters, chop off the last one and use it
1392 as the subject of the repeat. If there was only one character, we can
1393 abolish the previous item altogether. A repeat with a zero minimum wipes
1394 out any reqchar setting, backing up to the previous value. We must also
1395 adjust the countlits value. */
1397 if (*previous == OP_CHARS)
1399 int len = previous[1];
1401 if (repeat_min == 0) *reqchar = prevreqchar;
1402 *countlits += repeat_min - 1;
1411 c = previous[len+1];
1415 op_type = 0; /* Use single-char op codes */
1416 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1419 /* If previous was a single negated character ([^a] or similar), we use
1420 one of the special opcodes, replacing it. The code is shared with single-
1421 character repeats by adding a suitable offset into repeat_type. */
1423 else if ((int)*previous == OP_NOT)
1425 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1428 goto OUTPUT_SINGLE_REPEAT;
1431 /* If previous was a character type match (\d or similar), abolish it and
1432 create a suitable repeat item. The code is shared with single-character
1433 repeats by adding a suitable offset into repeat_type. */
1435 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1437 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1441 OUTPUT_SINGLE_REPEAT:
1443 /* If the maximum is zero then the minimum must also be zero; Perl allows
1444 this case, so we do too - by simply omitting the item altogether. */
1446 if (repeat_max == 0) goto END_REPEAT;
1448 /* Combine the op_type with the repeat_type */
1450 repeat_type += op_type;
1452 /* A minimum of zero is handled either as the special case * or ?, or as
1453 an UPTO, with the maximum given. */
1455 if (repeat_min == 0)
1457 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1458 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1461 *code++ = OP_UPTO + repeat_type;
1462 *code++ = repeat_max >> 8;
1463 *code++ = (repeat_max & 255);
1467 /* The case {1,} is handled as the special case + */
1469 else if (repeat_min == 1 && repeat_max == -1)
1470 *code++ = OP_PLUS + repeat_type;
1472 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1473 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1477 if (repeat_min != 1)
1479 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1480 *code++ = repeat_min >> 8;
1481 *code++ = (repeat_min & 255);
1484 /* If the mininum is 1 and the previous item was a character string,
1485 we either have to put back the item that got cancelled if the string
1486 length was 1, or add the character back onto the end of a longer
1487 string. For a character type nothing need be done; it will just get
1488 put back naturally. Note that the final character is always going to
1491 else if (*previous == OP_CHARS)
1493 if (code == previous) code += 2; else previous[1]++;
1496 /* For a single negated character we also have to put back the
1497 item that got cancelled. */
1499 else if (*previous == OP_NOT) code++;
1501 /* If the maximum is unlimited, insert an OP_STAR. */
1506 *code++ = OP_STAR + repeat_type;
1509 /* Else insert an UPTO if the max is greater than the min. */
1511 else if (repeat_max != repeat_min)
1514 repeat_max -= repeat_min;
1515 *code++ = OP_UPTO + repeat_type;
1516 *code++ = repeat_max >> 8;
1517 *code++ = (repeat_max & 255);
1521 /* The character or character type itself comes last in all cases. */
1526 /* If previous was a character class or a back reference, we put the repeat
1527 stuff after it, but just skip the item if the repeat was {0,0}. */
1529 else if (*previous == OP_CLASS || *previous == OP_REF)
1531 if (repeat_max == 0)
1536 if (repeat_min == 0 && repeat_max == -1)
1537 *code++ = OP_CRSTAR + repeat_type;
1538 else if (repeat_min == 1 && repeat_max == -1)
1539 *code++ = OP_CRPLUS + repeat_type;
1540 else if (repeat_min == 0 && repeat_max == 1)
1541 *code++ = OP_CRQUERY + repeat_type;
1544 *code++ = OP_CRRANGE + repeat_type;
1545 *code++ = repeat_min >> 8;
1546 *code++ = repeat_min & 255;
1547 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1548 *code++ = repeat_max >> 8;
1549 *code++ = repeat_max & 255;
1553 /* If previous was a bracket group, we may have to replicate it in certain
1556 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1557 (int)*previous == OP_COND)
1561 int len = code - previous;
1562 uschar *bralink = NULL;
1564 /* If the maximum repeat count is unlimited, find the end of the bracket
1565 by scanning through from the start, and compute the offset back to it
1566 from the current code pointer. There may be an OP_OPT setting following
1567 the final KET, so we can't find the end just by going back from the code
1570 if (repeat_max == -1)
1572 register uschar *ket = previous;
1573 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1574 ketoffset = code - ket;
1577 /* The case of a zero minimum is special because of the need to stick
1578 OP_BRAZERO in front of it, and because the group appears once in the
1579 data, whereas in other cases it appears the minimum number of times. For
1580 this reason, it is simplest to treat this case separately, as otherwise
1581 the code gets far too mess. There are several special subcases when the
1584 if (repeat_min == 0)
1586 /* If we set up a required char from the bracket, we must back off
1587 to the previous value and reset the countlits value too. */
1589 if (subcountlits > 0)
1591 *reqchar = prevreqchar;
1592 *countlits -= subcountlits;
1595 /* If the maximum is also zero, we just omit the group from the output
1598 if (repeat_max == 0)
1604 /* If the maximum is 1 or unlimited, we just have to stick in the
1605 BRAZERO and do no more at this point. */
1607 if (repeat_max <= 1)
1609 memmove(previous+1, previous, len);
1611 *previous++ = OP_BRAZERO + repeat_type;
1614 /* If the maximum is greater than 1 and limited, we have to replicate
1615 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1616 The first one has to be handled carefully because it's the original
1617 copy, which has to be moved up. The remainder can be handled by code
1618 that is common with the non-zero minimum case below. We just have to
1619 adjust the value or repeat_max, since one less copy is required. */
1624 memmove(previous+4, previous, len);
1626 *previous++ = OP_BRAZERO + repeat_type;
1627 *previous++ = OP_BRA;
1629 /* We chain together the bracket offset fields that have to be
1630 filled in later when the ends of the brackets are reached. */
1632 offset = (bralink == NULL)? 0 : previous - bralink;
1634 *previous++ = offset >> 8;
1635 *previous++ = offset & 255;
1641 /* If the minimum is greater than zero, replicate the group as many
1642 times as necessary, and adjust the maximum to the number of subsequent
1643 copies that we need. */
1647 for (i = 1; i < repeat_min; i++)
1649 memcpy(code, previous, len);
1652 if (repeat_max > 0) repeat_max -= repeat_min;
1655 /* This code is common to both the zero and non-zero minimum cases. If
1656 the maximum is limited, it replicates the group in a nested fashion,
1657 remembering the bracket starts on a stack. In the case of a zero minimum,
1658 the first one was set up above. In all cases the repeat_max now specifies
1659 the number of additional copies needed. */
1661 if (repeat_max >= 0)
1663 for (i = repeat_max - 1; i >= 0; i--)
1665 *code++ = OP_BRAZERO + repeat_type;
1667 /* All but the final copy start a new nesting, maintaining the
1668 chain of brackets outstanding. */
1674 offset = (bralink == NULL)? 0 : code - bralink;
1676 *code++ = offset >> 8;
1677 *code++ = offset & 255;
1680 memcpy(code, previous, len);
1684 /* Now chain through the pending brackets, and fill in their length
1685 fields (which are holding the chain links pro tem). */
1687 while (bralink != NULL)
1690 int offset = code - bralink + 1;
1691 uschar *bra = code - offset;
1692 oldlinkoffset = (bra[1] << 8) + bra[2];
1693 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1695 *code++ = bra[1] = offset >> 8;
1696 *code++ = bra[2] = (offset & 255);
1700 /* If the maximum is unlimited, set a repeater in the final copy. We
1701 can't just offset backwards from the current code point, because we
1702 don't know if there's been an options resetting after the ket. The
1703 correct offset was computed above. */
1705 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1708 /* Else there's some kind of shambles */
1716 /* In all case we no longer have a previous item. */
1723 /* Start of nested bracket sub-expression, or comment or lookahead or
1724 lookbehind or option setting or condition. First deal with special things
1725 that can come after a bracket; all are introduced by ?, and the appearance
1726 of any of them means that this is not a referencing group. They were
1727 checked for validity in the first pass over the string, so we don't have to
1728 check for syntax errors here. */
1731 newoptions = options;
1734 if (*(++ptr) == '?')
1741 case '#': /* Comment; skip to ket */
1743 while (*ptr != ')') ptr++;
1746 case ':': /* Non-extracting bracket */
1752 bravalue = OP_COND; /* Conditional group */
1753 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1755 condref = *ptr - '0';
1756 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1767 case '=': /* Positive lookahead */
1768 bravalue = OP_ASSERT;
1772 case '!': /* Negative lookahead */
1773 bravalue = OP_ASSERT_NOT;
1777 case '<': /* Lookbehinds */
1780 case '=': /* Positive lookbehind */
1781 bravalue = OP_ASSERTBACK;
1785 case '!': /* Negative lookbehind */
1786 bravalue = OP_ASSERTBACK_NOT;
1790 default: /* Syntax error */
1796 case '>': /* One-time brackets */
1801 case 'R': /* Pattern recursion */
1802 *code++ = OP_RECURSE;
1806 default: /* Option setting */
1810 while (*ptr != ')' && *ptr != ':')
1814 case '-': optset = &unset; break;
1816 case 'i': *optset |= PCRE_CASELESS; break;
1817 case 'm': *optset |= PCRE_MULTILINE; break;
1818 case 's': *optset |= PCRE_DOTALL; break;
1819 case 'x': *optset |= PCRE_EXTENDED; break;
1820 case 'U': *optset |= PCRE_UNGREEDY; break;
1821 case 'X': *optset |= PCRE_EXTRA; break;
1829 /* Set up the changed option bits, but don't change anything yet. */
1831 newoptions = (options | set) & (~unset);
1833 /* If the options ended with ')' this is not the start of a nested
1834 group with option changes, so the options change at this level. At top
1835 level there is nothing else to be done (the options will in fact have
1836 been set from the start of compiling as a result of the first pass) but
1837 at an inner level we must compile code to change the ims options if
1838 necessary, and pass the new setting back so that it can be put at the
1839 start of any following branches, and when this group ends, a resetting
1840 item can be compiled. */
1844 if ((options & PCRE_INGROUP) != 0 &&
1845 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1848 *code++ = *optchanged = newoptions & PCRE_IMS;
1850 options = newoptions; /* Change options at this level */
1851 previous = NULL; /* This item can't be repeated */
1852 continue; /* It is complete */
1855 /* If the options ended with ':' we are heading into a nested group
1856 with possible change of options. Such groups are non-capturing and are
1857 not assertions of any kind. All we need to do is skip over the ':';
1858 the newoptions value is handled below. */
1865 /* Else we have a referencing group; adjust the opcode. */
1869 if (++(*brackets) > EXTRACT_MAX)
1874 bravalue = OP_BRA + *brackets;
1877 /* Process nested bracketed re. Assertions may not be repeated, but other
1878 kinds can be. We copy code into a non-register variable in order to be able
1879 to pass its address because some compilers complain otherwise. Pass in a
1880 new setting for the ims options if they have changed. */
1882 previous = (bravalue >= OP_ONCE)? code : NULL;
1887 options | PCRE_INGROUP, /* Set for all nested groups */
1888 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1889 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1890 brackets, /* Bracket level */
1891 &tempcode, /* Where to put code (updated) */
1892 &ptr, /* Input pointer (updated) */
1893 errorptr, /* Where to put an error message */
1894 (bravalue == OP_ASSERTBACK ||
1895 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1896 condref, /* Condition reference number */
1897 &subreqchar, /* For possible last char */
1898 &subcountlits, /* For literal count */
1899 cd)) /* Tables block */
1902 /* At the end of compiling, code is still pointing to the start of the
1903 group, while tempcode has been updated to point past the end of the group
1904 and any option resetting that may follow it. The pattern pointer (ptr)
1905 is on the bracket. */
1907 /* If this is a conditional bracket, check that there are no more than
1908 two branches in the group. */
1910 if (bravalue == OP_COND)
1917 tc += (tc[1] << 8) | tc[2];
1919 while (*tc != OP_KET);
1928 /* Handle updating of the required character. If the subpattern didn't
1929 set one, leave it as it was. Otherwise, update it for normal brackets of
1930 all kinds, forward assertions, and conditions with two branches. Don't
1931 update the literal count for forward assertions, however. If the bracket
1932 is followed by a quantifier with zero repeat, we have to back off. Hence
1933 the definition of prevreqchar and subcountlits outside the main loop so
1934 that they can be accessed for the back off. */
1936 if (subreqchar > 0 &&
1937 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1938 (bravalue == OP_COND && condcount == 2)))
1940 prevreqchar = *reqchar;
1941 *reqchar = subreqchar;
1942 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1945 /* Now update the main code pointer to the end of the group. */
1949 /* Error if hit end of pattern */
1958 /* Check \ for being a real metacharacter; if not, fall through and handle
1959 it as a data character at the start of a string. Escape items are checked
1960 for validity in the pre-compiling pass. */
1964 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1966 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1967 are arranged to be the negation of the corresponding OP_values. For the
1968 back references, the values are ESC_REF plus the reference number. Only
1969 back references and those types that consume a character may be repeated.
1970 We can test for values between ESC_b and ESC_Z for the latter; this may
1971 have to change if any new ones are ever created. */
1979 *code++ = -c - ESC_REF;
1983 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1989 /* Data character: reset and fall through */
1994 /* Handle a run of data characters until a metacharacter is encountered.
1995 The first character is guaranteed not to be whitespace or # when the
1996 extended flag is set. */
2007 if ((options & PCRE_EXTENDED) != 0)
2009 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2012 /* The space before the ; is to avoid a warning on a silly compiler
2013 on the Macintosh. */
2014 while ((c = *(++ptr)) != 0 && c != '\n') ;
2020 /* Backslash may introduce a data char or a metacharacter. Escaped items
2021 are checked for validity in the pre-compiling pass. Stop the string
2022 before a metaitem. */
2027 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2028 if (c < 0) { ptr = tempptr; break; }
2030 /* If a character is > 127 in UTF-8 mode, we have to turn it into
2031 two or more characters in the UTF-8 encoding. */
2034 if (c > 127 && (options & PCRE_UTF8) != 0)
2037 int len = ord2utf8(c, buffer);
2038 for (c = 0; c < len; c++) *code++ = buffer[c];
2045 /* Ordinary character or single-char escape */
2051 /* This "while" is the end of the "do" above. */
2053 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2055 /* Update the last character and the count of literals */
2057 prevreqchar = (length > 1)? code[-2] : *reqchar;
2058 *reqchar = code[-1];
2059 *countlits += length;
2061 /* Compute the length and set it in the data vector, and advance to
2064 previous[1] = length;
2065 if (length < MAXLIT) ptr--;
2068 } /* end of big loop */
2070 /* Control never reaches here by falling through, only by a goto for all the
2071 error states. Pass back the position in the pattern so that it can be displayed
2072 to the user for diagnosing the error. */
2082 /*************************************************
2083 * Compile sequence of alternatives *
2084 *************************************************/
2086 /* On entry, ptr is pointing past the bracket character, but on return
2087 it points to the closing bracket, or vertical bar, or end of string.
2088 The code variable is pointing at the byte into which the BRA operator has been
2089 stored. If the ims options are changed at the start (for a (?ims: group) or
2090 during any branch, we need to insert an OP_OPT item at the start of every
2091 following branch to ensure they get set correctly at run time, and also pass
2092 the new options into every subsequent branch compile.
2095 options the option bits
2096 optchanged new ims options to set as if (?ims) were at the start, or -1
2098 brackets -> int containing the number of extracting brackets used
2099 codeptr -> the address of the current code pointer
2100 ptrptr -> the address of the current pattern pointer
2101 errorptr -> pointer to error message
2102 lookbehind TRUE if this is a lookbehind assertion
2103 condref >= 0 for OPT_CREF setting at start of conditional group
2104 reqchar -> place to put the last required character, or a negative number
2105 countlits -> place to put the shortest literal count of any branch
2106 cd points to the data block with tables pointers
2108 Returns: TRUE on success
2112 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2113 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2114 int *reqchar, int *countlits, compile_data *cd)
2116 const uschar *ptr = *ptrptr;
2117 uschar *code = *codeptr;
2118 uschar *last_branch = code;
2119 uschar *start_bracket = code;
2120 uschar *reverse_count = NULL;
2121 int oldoptions = options & PCRE_IMS;
2122 int branchreqchar, branchcountlits;
2125 *countlits = INT_MAX;
2128 /* At the start of a reference-based conditional group, insert the reference
2129 number as an OP_CREF item. */
2137 /* Loop for each alternative branch */
2143 /* Handle change of options */
2145 if (optchanged >= 0)
2148 *code++ = optchanged;
2149 options = (options & ~PCRE_IMS) | optchanged;
2152 /* Set up dummy OP_REVERSE if lookbehind assertion */
2156 *code++ = OP_REVERSE;
2157 reverse_count = code;
2162 /* Now compile the branch */
2164 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2165 &branchreqchar, &branchcountlits, cd))
2171 /* Fill in the length of the last branch */
2173 length = code - last_branch;
2174 last_branch[1] = length >> 8;
2175 last_branch[2] = length & 255;
2177 /* Save the last required character if all branches have the same; a current
2178 value of -1 means unset, while -2 means "previous branch had no last required
2183 if (branchreqchar >= 0)
2185 if (*reqchar == -1) *reqchar = branchreqchar;
2186 else if (*reqchar != branchreqchar) *reqchar = -2;
2191 /* Keep the shortest literal count */
2193 if (branchcountlits < *countlits) *countlits = branchcountlits;
2194 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2196 /* If lookbehind, check that this branch matches a fixed-length string,
2197 and put the length into the OP_REVERSE item. Temporarily mark the end of
2198 the branch with OP_END. */
2203 length = find_fixedlength(last_branch, options);
2204 DPRINTF(("fixed length = %d\n", length));
2211 reverse_count[0] = (length >> 8);
2212 reverse_count[1] = length & 255;
2215 /* Reached end of expression, either ')' or end of pattern. Insert a
2216 terminating ket and the length of the whole bracketed item, and return,
2217 leaving the pointer at the terminating char. If any of the ims options
2218 were changed inside the group, compile a resetting op-code following. */
2222 length = code - start_bracket;
2224 *code++ = length >> 8;
2225 *code++ = length & 255;
2226 if (optchanged >= 0)
2229 *code++ = oldoptions;
2236 /* Another branch follows; insert an "or" node and advance the pointer. */
2243 /* Control never reaches here */
2249 /*************************************************
2250 * Find first significant op code *
2251 *************************************************/
2253 /* This is called by several functions that scan a compiled expression looking
2254 for a fixed first character, or an anchoring op code etc. It skips over things
2255 that do not influence this. For one application, a change of caseless option is
2259 code pointer to the start of the group
2260 options pointer to external options
2261 optbit the option bit whose changing is significant, or
2263 optstop TRUE to return on option change, otherwise change the options
2266 Returns: pointer to the first significant opcode
2269 static const uschar*
2270 first_significant_code(const uschar *code, int *options, int optbit,
2278 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2280 if (optstop) return code;
2281 *options = (int)code[1];
2290 case OP_WORD_BOUNDARY:
2291 case OP_NOT_WORD_BOUNDARY:
2297 case OP_ASSERTBACK_NOT:
2298 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2306 /* Control never reaches here */
2312 /*************************************************
2313 * Check for anchored expression *
2314 *************************************************/
2316 /* Try to find out if this is an anchored regular expression. Consider each
2317 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2318 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2319 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2320 counts, since OP_CIRC can match in the middle.
2322 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2323 because that will try the rest of the pattern at all possible matching points,
2324 so there is no point trying them again.
2327 code points to start of expression (the bracket)
2328 options points to the options setting
2330 Returns: TRUE or FALSE
2334 is_anchored(register const uschar *code, int *options)
2337 const uschar *scode = first_significant_code(code + 3, options,
2338 PCRE_MULTILINE, FALSE);
2339 register int op = *scode;
2340 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2341 { if (!is_anchored(scode, options)) return FALSE; }
2342 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2343 (*options & PCRE_DOTALL) != 0)
2344 { if (scode[1] != OP_ANY) return FALSE; }
2345 else if (op != OP_SOD &&
2346 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2348 code += (code[1] << 8) + code[2];
2350 while (*code == OP_ALT);
2356 /*************************************************
2357 * Check for starting with ^ or .* *
2358 *************************************************/
2360 /* This is called to find out if every branch starts with ^ or .* so that
2361 "first char" processing can be done to speed things up in multiline
2362 matching and for non-DOTALL patterns that start with .* (which must start at
2363 the beginning or after \n).
2365 Argument: points to start of expression (the bracket)
2366 Returns: TRUE or FALSE
2370 is_startline(const uschar *code)
2373 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2374 register int op = *scode;
2375 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2376 { if (!is_startline(scode)) return FALSE; }
2377 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2378 { if (scode[1] != OP_ANY) return FALSE; }
2379 else if (op != OP_CIRC) return FALSE;
2380 code += (code[1] << 8) + code[2];
2382 while (*code == OP_ALT);
2388 /*************************************************
2389 * Check for fixed first char *
2390 *************************************************/
2392 /* Try to find out if there is a fixed first character. This is called for
2393 unanchored expressions, as it speeds up their processing quite considerably.
2394 Consider each alternative branch. If they all start with the same char, or with
2395 a bracket all of whose alternatives start with the same char (recurse ad lib),
2396 then we return that char, otherwise -1.
2399 code points to start of expression (the bracket)
2400 options pointer to the options (used to check casing changes)
2402 Returns: -1 or the fixed first char
2406 find_firstchar(const uschar *code, int *options)
2408 register int c = -1;
2411 const uschar *scode = first_significant_code(code + 3, options,
2412 PCRE_CASELESS, TRUE);
2413 register int op = *scode;
2415 if (op >= OP_BRA) op = OP_BRA;
2426 if ((d = find_firstchar(scode, options)) < 0) return -1;
2427 if (c < 0) c = d; else if (c != d) return -1;
2430 case OP_EXACT: /* Fall through */
2433 case OP_CHARS: /* Fall through */
2438 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2442 code += (code[1] << 8) + code[2];
2444 while (*code == OP_ALT);
2452 /*************************************************
2453 * Compile a Regular Expression *
2454 *************************************************/
2456 /* This function takes a string and returns a pointer to a block of store
2457 holding a compiled version of the expression.
2460 pattern the regular expression
2461 options various option bits
2462 errorptr pointer to pointer to error text
2463 erroroffset ptr offset in pattern where error was detected
2464 tables pointer to character tables or NULL
2466 Returns: pointer to compiled data block, or NULL on error,
2467 with errorptr and erroroffset set
2471 pcre_compile(const char *pattern, int options, const char **errorptr,
2472 int *erroroffset, const unsigned char *tables)
2475 int length = 3; /* For initial BRA plus length */
2477 int c, reqchar, countlits;
2479 int top_backref = 0;
2480 int branch_extra = 0;
2481 int branch_newextra;
2482 unsigned int brastackptr = 0;
2486 compile_data compile_block;
2487 int brastack[BRASTACK_SIZE];
2488 uschar bralenstack[BRASTACK_SIZE];
2491 uschar *code_base, *code_end;
2494 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2496 #ifndef SUPPORT_UTF8
2497 if ((options & PCRE_UTF8) != 0)
2504 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2505 can do is just return NULL. */
2507 if (errorptr == NULL) return NULL;
2510 /* However, we can give a message for this error */
2512 if (erroroffset == NULL)
2519 if ((options & ~PUBLIC_OPTIONS) != 0)
2525 /* Set up pointers to the individual character tables */
2527 if (tables == NULL) tables = pcre_default_tables;
2528 compile_block.lcc = tables + lcc_offset;
2529 compile_block.fcc = tables + fcc_offset;
2530 compile_block.cbits = tables + cbits_offset;
2531 compile_block.ctypes = tables + ctypes_offset;
2533 /* Reflect pattern for debugging output */
2535 DPRINTF(("------------------------------------------------------------------\n"));
2536 DPRINTF(("%s\n", pattern));
2538 /* The first thing to do is to make a pass over the pattern to compute the
2539 amount of store required to hold the compiled code. This does not have to be
2540 perfect as long as errors are overestimates. At the same time we can detect any
2541 internal flag settings. Make an attempt to correct for any counted white space
2542 if an "extended" flag setting appears late in the pattern. We can't be so
2543 clever for #-comments. */
2545 ptr = (const uschar *)(pattern - 1);
2546 while ((c = *(++ptr)) != 0)
2549 int class_charcount;
2551 if ((options & PCRE_EXTENDED) != 0)
2553 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2556 /* The space before the ; is to avoid a warning on a silly compiler
2557 on the Macintosh. */
2558 while ((c = *(++ptr)) != 0 && c != '\n') ;
2565 /* A backslashed item may be an escaped "normal" character or a
2566 character type. For a "normal" character, put the pointers and
2567 character back so that tests for whitespace etc. in the input
2568 are done correctly. */
2572 const uschar *save_ptr = ptr;
2573 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2574 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2584 /* A back reference needs an additional char, plus either one or 5
2585 bytes for a repeat. We also need to keep the value of the highest
2590 int refnum = -c - ESC_REF;
2591 if (refnum > top_backref) top_backref = refnum;
2592 length++; /* For single back reference */
2593 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2595 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2596 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2597 if ((min == 0 && (max == 1 || max == -1)) ||
2598 (min == 1 && max == -1))
2601 if (ptr[1] == '?') ptr++;
2609 case '*': /* These repeats won't be after brackets; */
2610 case '+': /* those are handled separately */
2615 /* This covers the cases of repeats after a single char, metachar, class,
2616 or back reference. */
2619 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2620 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2621 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2622 if ((min == 0 && (max == 1 || max == -1)) ||
2623 (min == 1 && max == -1))
2627 length--; /* Uncount the original char or metachar */
2628 if (min == 1) length++; else if (min > 0) length += 4;
2629 if (max > 0) length += 4; else length += 2;
2631 if (ptr[1] == '?') ptr++;
2634 /* An alternation contains an offset to the next branch or ket. If any ims
2635 options changed in the previous branch(es), and/or if we are in a
2636 lookbehind assertion, extra space will be needed at the start of the
2637 branch. This is handled by branch_extra. */
2640 length += 3 + branch_extra;
2643 /* A character class uses 33 characters. Don't worry about character types
2644 that aren't allowed in classes - they'll get picked up during the compile.
2645 A character class that contains only one character uses 2 or 3 bytes,
2646 depending on whether it is negated or not. Notice this where we can. */
2649 class_charcount = 0;
2650 if (*(++ptr) == '^') ptr++;
2655 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2657 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2658 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2660 else class_charcount++;
2663 while (*ptr != 0 && *ptr != ']');
2665 /* Repeats for negated single chars are handled by the general code */
2667 if (class_charcount == 1) length += 3; else
2671 /* A repeat needs either 1 or 5 bytes. */
2673 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2675 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2676 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2677 if ((min == 0 && (max == 1 || max == -1)) ||
2678 (min == 1 && max == -1))
2681 if (ptr[1] == '?') ptr++;
2686 /* Brackets may be genuine groups or special things */
2689 branch_newextra = 0;
2691 /* Handle special forms of bracket, which all start (? */
2700 /* Skip over comments entirely */
2703 while (*ptr != 0 && *ptr != ')') ptr++;
2707 goto PCRE_ERROR_RETURN;
2711 /* Non-referencing groups and lookaheads just move the pointer on, and
2712 then behave like a non-special bracket, except that they don't increment
2713 the count of extracting brackets. Ditto for the "once only" bracket,
2714 which is in Perl from version 5.005. */
2723 /* A recursive call to the regex is an extension, to provide the
2724 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2730 goto PCRE_ERROR_RETURN;
2736 /* Lookbehinds are in Perl from version 5.005 */
2739 if (ptr[3] == '=' || ptr[3] == '!')
2742 branch_newextra = 3;
2743 length += 3; /* For the first branch */
2747 goto PCRE_ERROR_RETURN;
2749 /* Conditionals are in Perl from version 5.005. The bracket must either
2750 be followed by a number (for bracket reference) or by an assertion
2754 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2758 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2762 goto PCRE_ERROR_RETURN;
2765 else /* An assertion must follow */
2767 ptr++; /* Can treat like ':' as far as spacing is concerned */
2768 if (ptr[2] != '?' ||
2769 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2771 ptr += 2; /* To get right offset in message */
2773 goto PCRE_ERROR_RETURN;
2778 /* Else loop checking valid options until ) is met. Anything else is an
2779 error. If we are without any brackets, i.e. at top level, the settings
2780 act as if specified in the options, so massage the options immediately.
2781 This is for backward compatibility with Perl 5.004. */
2794 *optset |= PCRE_CASELESS;
2798 *optset |= PCRE_MULTILINE;
2802 *optset |= PCRE_DOTALL;
2806 *optset |= PCRE_EXTENDED;
2810 *optset |= PCRE_EXTRA;
2814 *optset |= PCRE_UNGREEDY;
2821 /* A termination by ')' indicates an options-setting-only item;
2822 this is global at top level; otherwise nothing is done here and
2823 it is handled during the compiling process on a per-bracket-group
2827 if (brastackptr == 0)
2829 options = (options | set) & (~unset);
2830 set = unset = 0; /* To save length */
2834 /* A termination by ':' indicates the start of a nested group with
2835 the given options set. This is again handled at compile time, but
2836 we must allow for compiled space if any of the ims options are
2837 set. We also have to allow for resetting space at the end of
2838 the group, which is why 4 is added to the length and not just 2.
2839 If there are several changes of options within the same group, this
2840 will lead to an over-estimate on the length, but this shouldn't
2841 matter very much. We also have to allow for resetting options at
2842 the start of any alternations, which we do by setting
2843 branch_newextra to 2. Finally, we record whether the case-dependent
2844 flag ever changes within the regex. This is used by the "required
2848 if (((set|unset) & PCRE_IMS) != 0)
2851 branch_newextra = 2;
2852 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2856 /* Unrecognized option character */
2860 goto PCRE_ERROR_RETURN;
2864 /* If we hit a closing bracket, that's it - this is a freestanding
2865 option-setting. We need to ensure that branch_extra is updated if
2866 necessary. The only values branch_newextra can have here are 0 or 2.
2867 If the value is 2, then branch_extra must either be 2 or 5, depending
2868 on whether this is a lookbehind group or not. */
2873 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2874 branch_extra += branch_newextra;
2878 /* If options were terminated by ':' control comes here. Fall through
2879 to handle the group below. */
2883 /* Extracting brackets must be counted so we can process escapes in a
2888 /* Non-special forms of bracket. Save length for computing whole length
2889 at end if there's a repeat that requires duplication of the group. Also
2890 save the current value of branch_extra, and start the new group with
2891 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2892 for a lookbehind assertion. */
2894 if (brastackptr >= sizeof(brastack)/sizeof(int))
2897 goto PCRE_ERROR_RETURN;
2900 bralenstack[brastackptr] = branch_extra;
2901 branch_extra = branch_newextra;
2903 brastack[brastackptr++] = length;
2907 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2908 have to replicate this bracket up to that many times. If brastackptr is
2909 0 this is an unmatched bracket which will generate an error, but take care
2910 not to try to access brastack[-1] when computing the length and restoring
2911 the branch_extra value. */
2920 if (brastackptr > 0)
2922 duplength = length - brastack[--brastackptr];
2923 branch_extra = bralenstack[brastackptr];
2927 /* Leave ptr at the final char; for read_repeat_counts this happens
2928 automatically; for the others we need an increment. */
2930 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2932 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2934 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2936 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2937 else if (c == '+') { maxval = -1; ptr++; }
2938 else if (c == '?') { minval = 0; ptr++; }
2940 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2941 group, and if the maximum is greater than zero, we have to replicate
2942 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2943 bracket set - hence the 7. */
2948 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2951 /* When the minimum is greater than zero, 1 we have to replicate up to
2952 minval-1 times, with no additions required in the copies. Then, if
2953 there is a limited maximum we have to replicate up to maxval-1 times
2954 allowing for a BRAZERO item before each optional copy and nesting
2955 brackets for all but one of the optional copies. */
2959 length += (minval - 1) * duplength;
2960 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2961 length += (maxval - minval) * (duplength + 7) - 6;
2966 /* Non-special character. For a run of such characters the length required
2967 is the number of characters + 2, except that the maximum run length is 255.
2968 We won't get a skipped space or a non-data escape or the start of a #
2969 comment as the first character, so the length can't be zero. */
2977 if ((options & PCRE_EXTENDED) != 0)
2979 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2982 /* The space before the ; is to avoid a warning on a silly compiler
2983 on the Macintosh. */
2984 while ((c = *(++ptr)) != 0 && c != '\n') ;
2989 /* Backslash may introduce a data char or a metacharacter; stop the
2990 string before the latter. */
2994 const uschar *saveptr = ptr;
2995 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2997 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2998 if (c < 0) { ptr = saveptr; break; }
3001 if (c > 127 && (options & PCRE_UTF8) != 0)
3004 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3005 if (c <= utf8_table1[i]) break;
3011 /* Ordinary character or single-char escape */
3016 /* This "while" is the end of the "do" above. */
3018 while (runlength < MAXLIT &&
3019 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3022 length += runlength;
3027 length += 4; /* For final KET and END */
3035 /* Compute the size of data block needed and get it, either from malloc or
3036 externally provided function. We specify "code[0]" in the offsetof() expression
3037 rather than just "code", because it has been reported that one broken compiler
3038 fails on "code" because it is also an independent variable. It should make no
3039 difference to the value of the offsetof(). */
3041 size = length + offsetof(real_pcre, code[0]);
3042 re = (real_pcre *)(pcre_malloc)(size);
3050 /* Put in the magic number, and save the size, options, and table pointer */
3052 re->magic_number = MAGIC_NUMBER;
3054 re->options = options;
3055 re->tables = tables;
3057 /* Set up a starting, non-extracting bracket, then compile the expression. On
3058 error, *errorptr will be set non-NULL, so we don't need to look at the result
3059 of the function here. */
3061 ptr = (const uschar *)pattern;
3065 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3066 &reqchar, &countlits, &compile_block);
3067 re->top_bracket = bracount;
3068 re->top_backref = top_backref;
3070 /* If not reached end of pattern on success, there's an excess bracket. */
3072 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3074 /* Fill in the terminating state and check for disastrous overflow, but
3075 if debugging, leave the test till after things are printed out. */
3080 if (code - re->code > length) *errorptr = ERR23;
3083 /* Give an error if there's back reference to a non-existent capturing
3086 if (top_backref > re->top_bracket) *errorptr = ERR15;
3088 /* Failed to compile */
3090 if (*errorptr != NULL)
3094 *erroroffset = ptr - (const uschar *)pattern;
3098 /* If the anchored option was not passed, set flag if we can determine that the
3099 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3100 starting with .* when DOTALL is set).
3102 Otherwise, see if we can determine what the first character has to be, because
3103 that speeds up unanchored matches no end. If not, see if we can set the
3104 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3105 start with ^. and also when all branches start with .* for non-DOTALL matches.
3108 if ((options & PCRE_ANCHORED) == 0)
3110 int temp_options = options;
3111 if (is_anchored(re->code, &temp_options))
3112 re->options |= PCRE_ANCHORED;
3115 int ch = find_firstchar(re->code, &temp_options);
3118 re->first_char = ch;
3119 re->options |= PCRE_FIRSTSET;
3121 else if (is_startline(re->code))
3122 re->options |= PCRE_STARTLINE;
3126 /* Save the last required character if there are at least two literal
3127 characters on all paths, or if there is no first character setting. */
3129 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3131 re->req_char = reqchar;
3132 re->options |= PCRE_REQCHSET;
3135 /* Print out the compiled data for debugging */
3139 printf("Length = %d top_bracket = %d top_backref = %d\n",
3140 length, re->top_bracket, re->top_backref);
3142 if (re->options != 0)
3144 printf("%s%s%s%s%s%s%s%s%s\n",
3145 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3146 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3147 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3148 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3149 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3150 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3151 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3152 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3153 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3156 if ((re->options & PCRE_FIRSTSET) != 0)
3158 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3159 else printf("First char = \\x%02x\n", re->first_char);
3162 if ((re->options & PCRE_REQCHSET) != 0)
3164 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3165 else printf("Req char = \\x%02x\n", re->req_char);
3169 code_base = code = re->code;
3171 while (code < code_end)
3175 printf("%3d ", code - code_base);
3177 if (*code >= OP_BRA)
3179 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3186 printf(" %.2x %s", code[1], OP_names[*code]);
3191 printf("%3d Cond", (code[1] << 8) + code[2]);
3196 printf(" %.2d %s", code[1], OP_names[*code]);
3201 charlength = *(++code);
3202 printf("%3d ", charlength);
3203 while (charlength-- > 0)
3204 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3214 case OP_ASSERTBACK_NOT:
3216 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3221 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3232 case OP_TYPEMINSTAR:
3234 case OP_TYPEMINPLUS:
3236 case OP_TYPEMINQUERY:
3237 if (*code >= OP_TYPESTAR)
3238 printf(" %s", OP_names[code[1]]);
3239 else if (isprint(c = code[1])) printf(" %c", c);
3240 else printf(" \\x%02x", c);
3241 printf("%s", OP_names[*code++]);
3247 if (isprint(c = code[3])) printf(" %c{", c);
3248 else printf(" \\x%02x{", c);
3249 if (*code != OP_EXACT) printf("0,");
3250 printf("%d}", (code[1] << 8) + code[2]);
3251 if (*code == OP_MINUPTO) printf("?");
3257 case OP_TYPEMINUPTO:
3258 printf(" %s{", OP_names[code[3]]);
3259 if (*code != OP_TYPEEXACT) printf(",");
3260 printf("%d}", (code[1] << 8) + code[2]);
3261 if (*code == OP_TYPEMINUPTO) printf("?");
3266 if (isprint(c = *(++code))) printf(" [^%c]", c);
3267 else printf(" [^\\x%02x]", c);
3275 case OP_NOTMINQUERY:
3276 if (isprint(c = code[1])) printf(" [^%c]", c);
3277 else printf(" [^\\x%02x]", c);
3278 printf("%s", OP_names[*code++]);
3284 if (isprint(c = code[3])) printf(" [^%c]{", c);
3285 else printf(" [^\\x%02x]{", c);
3286 if (*code != OP_NOTEXACT) printf(",");
3287 printf("%d}", (code[1] << 8) + code[2]);
3288 if (*code == OP_NOTMINUPTO) printf("?");
3293 printf(" \\%d", *(++code));
3295 goto CLASS_REF_REPEAT;
3303 for (i = 0; i < 256; i++)
3305 if ((code[i/8] & (1 << (i&7))) != 0)
3308 for (j = i+1; j < 256; j++)
3309 if ((code[j/8] & (1 << (j&7))) == 0) break;
3310 if (i == '-' || i == ']') printf("\\");
3311 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3315 if (j == '-' || j == ']') printf("\\");
3316 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3334 printf("%s", OP_names[*code]);
3339 min = (code[1] << 8) + code[2];
3340 max = (code[3] << 8) + code[4];
3341 if (max == 0) printf("{%d,}", min);
3342 else printf("{%d,%d}", min, max);
3343 if (*code == OP_CRMINRANGE) printf("?");
3353 /* Anything else is just a one-node item */
3356 printf(" %s", OP_names[*code]);
3363 printf("------------------------------------------------------------------\n");
3365 /* This check is done here in the debugging case so that the code that
3366 was compiled can be seen. */
3368 if (code - re->code > length)
3372 *erroroffset = ptr - (uschar *)pattern;
3382 /*************************************************
3383 * Match a back-reference *
3384 *************************************************/
3386 /* If a back reference hasn't been set, the length that is passed is greater
3387 than the number of characters left in the string, so the match fails.
3390 offset index into the offset vector
3391 eptr points into the subject
3392 length length to be matched
3393 md points to match data block
3396 Returns: TRUE if matched
3400 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3401 unsigned long int ims)
3403 const uschar *p = md->start_subject + md->offset_vector[offset];
3406 if (eptr >= md->end_subject)
3407 printf("matching subject <null>");
3410 printf("matching subject ");
3411 pchars(eptr, length, TRUE, md);
3413 printf(" against backref ");
3414 pchars(p, length, FALSE, md);
3418 /* Always fail if not enough characters left */
3420 if (length > md->end_subject - eptr) return FALSE;
3422 /* Separate the caselesss case for speed */
3424 if ((ims & PCRE_CASELESS) != 0)
3426 while (length-- > 0)
3427 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3430 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3437 /*************************************************
3438 * Match from current position *
3439 *************************************************/
3441 /* On entry ecode points to the first opcode, and eptr to the first character
3442 in the subject string, while eptrb holds the value of eptr at the start of the
3443 last bracketed group - used for breaking infinite loops matching zero-length
3447 eptr pointer in subject
3448 ecode position in code
3449 offset_top current top pointer
3450 md pointer to "static" info for the match
3451 ims current /i, /m, and /s options
3452 eptrb pointer to chain of blocks containing eptr at start of
3453 brackets - for testing for empty matches
3455 match_condassert - this is an assertion condition
3456 match_isgroup - this is the start of a bracketed group
3458 Returns: TRUE if matched
3462 match(register const uschar *eptr, register const uschar *ecode,
3463 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3466 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3469 /* At the start of a bracketed group, add the current subject pointer to the
3470 stack of such pointers, to be re-instated at the end of the group when we hit
3471 the closing ket. When match() is called in other circumstances, we don't add to
3474 if ((flags & match_isgroup) != 0)
3476 newptrb.prev = eptrb;
3477 newptrb.saved_eptr = eptr;
3481 /* Now start processing the operations. */
3485 int op = (int)*ecode;
3486 int min, max, ctype;
3489 BOOL minimize = FALSE;
3491 /* Opening capturing bracket. If there is space in the offset vector, save
3492 the current subject position in the working slot at the top of the vector. We
3493 mustn't change the current values of the data slot, because they may be set
3494 from a previous iteration of this group, and be referred to by a reference
3497 If the bracket fails to match, we need to restore this value and also the
3498 values of the final offsets, in case they were set by a previous iteration of
3501 If there isn't enough space in the offset vector, treat this as if it were a
3502 non-capturing bracket. Don't worry about setting the flag for the error case
3503 here; that is handled in the code for KET. */
3507 int number = op - OP_BRA;
3508 int offset = number << 1;
3511 printf("start bracket %d subject=", number);
3512 pchars(eptr, 16, TRUE, md);
3516 if (offset < md->offset_max)
3518 int save_offset1 = md->offset_vector[offset];
3519 int save_offset2 = md->offset_vector[offset+1];
3520 int save_offset3 = md->offset_vector[md->offset_end - number];
3522 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3523 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3527 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3529 ecode += (ecode[1] << 8) + ecode[2];
3531 while (*ecode == OP_ALT);
3533 DPRINTF(("bracket %d failed\n", number));
3535 md->offset_vector[offset] = save_offset1;
3536 md->offset_vector[offset+1] = save_offset2;
3537 md->offset_vector[md->offset_end - number] = save_offset3;
3541 /* Insufficient room for saving captured contents */
3546 /* Other types of node can be handled by a switch */
3550 case OP_BRA: /* Non-capturing bracket: optimized */
3551 DPRINTF(("start bracket 0\n"));
3554 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3556 ecode += (ecode[1] << 8) + ecode[2];
3558 while (*ecode == OP_ALT);
3559 DPRINTF(("bracket 0 failed\n"));
3562 /* Conditional group: compilation checked that there are no more than
3563 two branches. If the condition is false, skipping the first branch takes us
3564 past the end if there is only one branch, but that's OK because that is
3565 exactly what going to the ket would do. */
3568 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3570 int offset = ecode[4] << 1; /* Doubled reference number */
3572 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3573 5 : 3 + (ecode[1] << 8) + ecode[2]),
3574 offset_top, md, ims, eptrb, match_isgroup);
3577 /* The condition is an assertion. Call match() to evaluate it - setting
3578 the final argument TRUE causes it to stop at the end of an assertion. */
3582 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3583 match_condassert | match_isgroup))
3585 ecode += 3 + (ecode[4] << 8) + ecode[5];
3586 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3588 else ecode += (ecode[1] << 8) + ecode[2];
3589 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3591 /* Control never reaches here */
3593 /* Skip over conditional reference data if encountered (should not be) */
3599 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3600 an empty string - recursion will then try other alternatives, if any. */
3603 if (md->notempty && eptr == md->start_match) return FALSE;
3604 md->end_match_ptr = eptr; /* Record where we ended */
3605 md->end_offset_top = offset_top; /* and how many extracts were taken */
3608 /* Change option settings */
3613 DPRINTF(("ims set to %02lx\n", ims));
3616 /* Assertion brackets. Check the alternative branches in turn - the
3617 matching won't pass the KET for an assertion. If any one branch matches,
3618 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3619 start of each branch to move the current point backwards, so the code at
3620 this level is identical to the lookahead case. */
3626 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3627 ecode += (ecode[1] << 8) + ecode[2];
3629 while (*ecode == OP_ALT);
3630 if (*ecode == OP_KET) return FALSE;
3632 /* If checking an assertion for a condition, return TRUE. */
3634 if ((flags & match_condassert) != 0) return TRUE;
3636 /* Continue from after the assertion, updating the offsets high water
3637 mark, since extracts may have been taken during the assertion. */
3639 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3641 offset_top = md->end_offset_top;
3644 /* Negative assertion: all branches must fail to match */
3647 case OP_ASSERTBACK_NOT:
3650 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3652 ecode += (ecode[1] << 8) + ecode[2];
3654 while (*ecode == OP_ALT);
3656 if ((flags & match_condassert) != 0) return TRUE;
3661 /* Move the subject pointer back. This occurs only at the start of
3662 each branch of a lookbehind assertion. If we are too close to the start to
3663 move back, this match function fails. When working with UTF-8 we move
3664 back a number of characters, not bytes. */
3668 c = (ecode[1] << 8) + ecode[2];
3669 for (i = 0; i < c; i++)
3675 eptr -= (ecode[1] << 8) + ecode[2];
3678 if (eptr < md->start_subject) return FALSE;
3682 /* Recursion matches the current regex, nested. If there are any capturing
3683 brackets started but not finished, we have to save their starting points
3684 and reinstate them after the recursion. However, we don't know how many
3685 such there are (offset_top records the completed total) so we just have
3686 to save all the potential data. There may be up to 99 such values, which
3687 is a bit large to put on the stack, but using malloc for small numbers
3688 seems expensive. As a compromise, the stack is used when there are fewer
3689 than 16 values to store; otherwise malloc is used. A problem is what to do
3690 if the malloc fails ... there is no way of returning to the top level with
3691 an error. Save the top 15 values on the stack, and accept that the rest
3702 if (c < 16) save = stacksave; else
3704 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3712 for (i = 1; i <= c; i++)
3713 save[i] = md->offset_vector[md->offset_end - i];
3714 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3716 for (i = 1; i <= c; i++)
3717 md->offset_vector[md->offset_end - i] = save[i];
3718 if (save != stacksave) (pcre_free)(save);
3719 if (!rc) return FALSE;
3721 /* In case the recursion has set more capturing values, save the final
3722 number, then move along the subject till after the recursive match,
3723 and advance one byte in the pattern code. */
3725 offset_top = md->end_offset_top;
3726 eptr = md->end_match_ptr;
3731 /* "Once" brackets are like assertion brackets except that after a match,
3732 the point in the subject string is not moved back. Thus there can never be
3733 a move back into the brackets. Check the alternative branches in turn - the
3734 matching won't pass the KET for this kind of subpattern. If any one branch
3735 matches, we carry on as at the end of a normal bracket, leaving the subject
3740 const uschar *prev = ecode;
3741 const uschar *saved_eptr = eptr;
3745 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3747 ecode += (ecode[1] << 8) + ecode[2];
3749 while (*ecode == OP_ALT);
3751 /* If hit the end of the group (which could be repeated), fail */
3753 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3755 /* Continue as from after the assertion, updating the offsets high water
3756 mark, since extracts may have been taken. */
3758 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3760 offset_top = md->end_offset_top;
3761 eptr = md->end_match_ptr;
3763 /* For a non-repeating ket, just continue at this level. This also
3764 happens for a repeating ket if no characters were matched in the group.
3765 This is the forcible breaking of infinite loops as implemented in Perl
3766 5.005. If there is an options reset, it will get obeyed in the normal
3767 course of events. */
3769 if (*ecode == OP_KET || eptr == saved_eptr)
3775 /* The repeating kets try the rest of the pattern or restart from the
3776 preceding bracket, in the appropriate order. We need to reset any options
3777 that changed within the bracket before re-running it, so check the next
3780 if (ecode[3] == OP_OPT)
3782 ims = (ims & ~PCRE_IMS) | ecode[4];
3783 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3786 if (*ecode == OP_KETRMIN)
3788 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3789 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3792 else /* OP_KETRMAX */
3794 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3795 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3800 /* An alternation is the end of a branch; scan along to find the end of the
3801 bracketed group and go to there. */
3804 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3807 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3808 that it may occur zero times. It may repeat infinitely, or not at all -
3809 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3810 repeat limits are compiled as a number of copies, with the optional ones