Line data Source code
1 : /*************************************************
2 : * Perl-Compatible Regular Expressions *
3 : *************************************************/
4 :
5 : /*
6 : This is a library of functions to support regular expressions whose syntax
7 : and semantics are as close as possible to those of the Perl 5 language. See
8 : the file Tech.Notes for some information on the internals.
9 :
10 : Written by: Philip Hazel <ph10@cam.ac.uk>
11 :
12 : Copyright (c) 1997-2000 University of Cambridge
13 :
14 : -----------------------------------------------------------------------------
15 : Permission is granted to anyone to use this software for any purpose on any
16 : computer system, and to redistribute it freely, subject to the following
17 : restrictions:
18 :
19 : 1. This software is distributed in the hope that it will be useful,
20 : but WITHOUT ANY WARRANTY; without even the implied warranty of
21 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 :
23 : 2. The origin of this software must not be misrepresented, either by
24 : explicit claim or by omission.
25 :
26 : 3. Altered versions must be plainly marked as such, and must not be
27 : misrepresented as being the original software.
28 :
29 : 4. If PCRE is embedded in any software that is released under the GNU
30 : General Purpose Licence (GPL), then the terms of that licence shall
31 : supersede any condition above with which it is incompatible.
32 : -----------------------------------------------------------------------------
33 : */
34 :
35 :
36 : /* Define DEBUG to get debugging output on stdout. */
37 :
38 : /* #define DEBUG */
39 :
40 : /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 : inline, and there are *still* stupid compilers about that don't like indented
42 : pre-processor statements. I suppose it's only been 10 years... */
43 :
#ifdef DEBUG
/* The macro argument is pasted directly after "printf", so it must be a
complete parenthesized argument list. Call sites therefore use double
parentheses, e.g. DPRINTF(("count=%d\n", n)). */
#define DPRINTF(p) printf p
#else
/* Debugging disabled: the whole call, arguments included, expands to
nothing. */
#define DPRINTF(p) /*nothing*/
#endif
49 :
50 : /* Include the internals header, which itself includes Standard C headers plus
51 : the external pcre header. */
52 :
53 : #include "internal.h"
54 :
55 :
56 : /* Allow compilation as C++ source code, should anybody want to do that. */
57 :
#ifdef __cplusplus
/* "class" is a keyword in C++, but this file uses it as an ordinary
identifier (for example the local bit map "uschar class[32]" in
compile_branch()), so rename it when compiling as C++. */
#define class pcre_class
#endif
61 :
62 :
/* Number of items on the nested bracket stacks at compile time. This should
not be set greater than 200. */

#define BRASTACK_SIZE 200


/* The number of bytes in a literal character string above which we can't add
any more is different when UTF-8 characters may be encountered.
NOTE(review): the lower UTF-8 limit presumably leaves headroom for appending
one multi-byte character to a string that is nearly full - confirm against the
code that uses MAXLIT. */

#ifdef SUPPORT_UTF8
#define MAXLIT 250
#else
#define MAXLIT 255
#endif
77 :
78 :
/* Minimum and maximum repeat counts for the six common repeat forms. In the
table of maxima, 0 stands for "no upper limit". NOTE(review): the entries are
presumably in the order *, *?, +, +?, ?, ?? (compare the repeat names in
OP_names) - confirm against the opcode definitions. */

static const char rep_min[] = {
  0, 0,       /* may match zero times */
  1, 1,       /* must match at least once */
  0, 0 };     /* may match zero times */

static const char rep_max[] = {
  0, 0,       /* unlimited */
  0, 0,       /* unlimited */
  1, 1 };     /* at most once */
83 :
/* Text forms of OP_ values and things, for debugging (not all used). The
table exists only when DEBUG is defined. NOTE(review): entries are presumably
indexed by opcode value, so their order has to track the OP_ definitions in
internal.h - confirm whenever an opcode is added or moved. */

#ifdef DEBUG
static const char *OP_names[] = {
  "End", "\\A", "\\B", "\\b", "\\D", "\\d",
  "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
  "Opt", "^", "$", "Any", "chars", "not",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{",
  "class", "Ref", "Recurse",
  "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
  "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
  "Brazero", "Braminzero", "Bra"
};
#endif
101 :
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
are simple data values; negative values are for special things like \d and so
on. Zero means further processing is needed (for things like \x), or the escape
is invalid.

The table is indexed by (c - '0'), where c is the character that follows the
backslash; check_escape() only consults it after checking that c lies between
'0' and 'z' inclusive. Each row below covers eight consecutive character
codes, as the trailing comments show. */

static const short int escapes[] = {
    0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
    0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
  '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
    0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
    0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
    0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
  '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */
    0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
    0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */
    0,      0, -ESC_z                                            /* x - z */
};
119 :
120 : /* Tables of names of POSIX character classes and their lengths. The list is
121 : terminated by a zero length entry. The first three must be alpha, upper, lower,
122 : as this is assumed for handling case independence. */
123 :
124 : static const char *posix_names[] = {
125 : "alpha", "lower", "upper",
126 : "alnum", "ascii", "cntrl", "digit", "graph",
127 : "print", "punct", "space", "word", "xdigit" };
128 :
129 : static const uschar posix_name_lengths[] = {
130 : 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131 :
/* Table of class bit maps for each POSIX class; up to three may be combined
to form the class. There is one row of three entries per class, in the same
order as the names in posix_names. NOTE(review): -1 presumably marks an unused
slot in a row - confirm against the code in compile_branch() that reads this
table. */

static const int posix_class_maps[] = {
  cbit_lower, cbit_upper, -1,             /* alpha */
  cbit_lower, -1,         -1,             /* lower */
  cbit_upper, -1,         -1,             /* upper */
  cbit_digit, cbit_lower, cbit_upper,     /* alnum */
  cbit_print, cbit_cntrl, -1,             /* ascii */
  cbit_cntrl, -1,         -1,             /* cntrl */
  cbit_digit, -1,         -1,             /* digit */
  cbit_graph, -1,         -1,             /* graph */
  cbit_print, -1,         -1,             /* print */
  cbit_punct, -1,         -1,             /* punct */
  cbit_space, -1,         -1,             /* space */
  cbit_word,  -1,         -1,             /* word */
  cbit_xdigit,-1,         -1              /* xdigit */
};
150 :
151 :
152 : /* Definition to allow mutual recursion */
153 :
154 : static BOOL
155 : compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156 : BOOL, int, int *, int *, compile_data *);
157 :
/* Structure for building a chain of data that actually lives on the
stack, for holding the values of the subject pointer at the start of each
subpattern, so as to detect when an empty string has been matched by a
subpattern - to break infinite loops. */

typedef struct eptrblock {
  struct eptrblock *prev;       /* Link to the previous block in the chain */
  const uschar *saved_eptr;     /* Subject pointer saved in this block */
} eptrblock;
167 :
168 : /* Flag bits for the match() function */
169 :
170 : #define match_condassert 0x01 /* Called to check a condition assertion */
171 : #define match_isgroup 0x02 /* Set if start of bracketed group */
172 :
173 :
174 :
175 : /*************************************************
176 : * Global variables *
177 : *************************************************/
178 :
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. */
183 :
184 : void *(*pcre_malloc)(size_t) = malloc;
185 : void (*pcre_free)(void *) = free;
186 :
187 :
188 :
189 : /*************************************************
190 : * Macros and tables for character handling *
191 : *************************************************/
192 :
193 : /* When UTF-8 encoding is being used, a character is no longer just a single
194 : byte. The macros for character handling generate simple sequences when used in
195 : byte-mode, and more complicated ones for UTF-8 characters. */
196 :
#ifndef SUPPORT_UTF8

/* Byte mode: every character is exactly one byte. Note that GETCHARLEN does
not touch its len argument in this mode, and BACKCHAR does nothing. */

#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* All three macros expand to plain statements (not expressions), and the
first two refer to a variable called "md" that must be in scope where they
are used; multi-byte decoding happens only when md->utf8 is set.

In UTF-8 the lead byte of a multi-byte character carries the MOST significant
bits of the value, and each following 10xxxxxx byte supplies the next six
bits down. The lead byte's data bits are therefore shifted up by six bits for
every additional byte, and the shift shrinks by six as each following byte is
merged in. */

/* Get the next UTF-8 character, advancing the pointer */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Shift for the first byte's bits */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    while (a-- > 0) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, setting length */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Shift for the first byte's bits */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    len += a; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif
245 :
246 :
247 :
248 : /*************************************************
249 : * Default character tables *
250 : *************************************************/
251 :
252 : /* A default set of character tables is included in the PCRE binary. Its source
253 : is built by the maketables auxiliary program, which uses the default C ctypes
254 : functions, and put in the file chartables.c. These tables are used by PCRE
255 : whenever the caller of pcre_compile() does not provide an alternate set of
256 : tables. */
257 :
258 : #include "chartables.c"
259 :
260 :
261 :
262 : #ifdef SUPPORT_UTF8
263 : /*************************************************
264 : * Tables for UTF-8 support *
265 : *************************************************/
266 :
267 : /* These are the breakpoints for different numbers of bytes in a UTF-8
268 : character. */
269 :
270 : static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
271 :
272 : /* These are the indicator bits and the mask for the data bits to set in the
273 : first byte of a character, indexed by the number of additional bytes. */
274 :
275 : static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276 : static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
277 :
278 : /* Table of the number of extra characters, indexed by the first character
279 : masked with 0x3f. The highest number for a valid UTF-8 character is in fact
280 : 0x3d. */
281 :
282 : static uschar utf8_table4[] = {
283 : 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284 : 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286 : 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
287 :
288 :
289 : /*************************************************
290 : * Convert character value to UTF-8 *
291 : *************************************************/
292 :
293 : /* This function takes an integer value in the range 0 - 0x7fffffff
294 : and encodes it as a UTF-8 character in 0 to 6 bytes.
295 :
296 : Arguments:
297 : cvalue the character value
298 : buffer pointer to buffer for result - at least 6 bytes long
299 :
300 : Returns: number of characters placed in the buffer
301 : */
302 :
303 : static int
304 : ord2utf8(int cvalue, uschar *buffer)
305 : {
306 : register int i, j;
307 : for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308 : if (cvalue <= utf8_table1[i]) break;
309 : *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
310 : cvalue >>= 6 - i;
311 : for (j = 0; j < i; j++)
312 : {
313 : *buffer++ = 0x80 | (cvalue & 0x3f);
314 : cvalue >>= 6;
315 : }
316 : return i + 1;
317 : }
318 : #endif
319 :
320 :
321 :
322 : /*************************************************
323 : * Return version string *
324 : *************************************************/
325 :
/* Two levels of macro are needed so that the version macros are expanded
before being turned into string literals. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Returns a pointer to a static string made from PCRE_MAJOR, PCRE_MINOR and
PCRE_DATE, in the form "<major>.<minor> <date>". */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
334 :
335 :
336 :
337 :
338 : /*************************************************
339 : * (Obsolete) Return info about compiled pattern *
340 : *************************************************/
341 :
342 : /* This is the original "info" function. It picks potentially useful data out
343 : of the private structure, but its interface was too rigid. It remains for
344 : backwards compatibility. The public options are passed back in an int - though
345 : the re->options field has been expanded to a long int, all the public options
346 : at the low end of it, and so even on 16-bit systems this will still be OK.
347 : Therefore, I haven't changed the API for pcre_info().
348 :
349 : Arguments:
350 : external_re points to compiled code
351 : optptr where to pass back the options
352 : first_char where to pass back the first character,
353 : or -1 if multiline and all branches start ^,
354 : or -2 otherwise
355 :
356 : Returns: number of capturing subpatterns
357 : or negative values on error
358 : */
359 :
360 : int
361 0 : pcre_info(const pcre *external_re, int *optptr, int *first_char)
362 : {
363 0 : const real_pcre *re = (const real_pcre *)external_re;
364 0 : if (re == NULL) return PCRE_ERROR_NULL;
365 0 : if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366 0 : if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367 0 : if (first_char != NULL)
368 0 : *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369 0 : ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
370 0 : return re->top_bracket;
371 : }
372 :
373 :
374 :
375 : /*************************************************
376 : * Return info about compiled pattern *
377 : *************************************************/
378 :
379 : /* This is a newer "info" function which has an extensible interface so
380 : that additional items can be added compatibly.
381 :
382 : Arguments:
383 : external_re points to compiled code
384 : external_study points to study data, or NULL
385 : what what information is required
386 : where where to put the information
387 :
388 : Returns: 0 if data returned, negative on error
389 : */
390 :
391 : int
392 0 : pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
393 : void *where)
394 : {
395 0 : const real_pcre *re = (const real_pcre *)external_re;
396 0 : const real_pcre_extra *study = (const real_pcre_extra *)study_data;
397 :
398 0 : if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399 0 : if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
400 :
401 0 : switch (what)
402 : {
403 0 : case PCRE_INFO_OPTIONS:
404 0 : *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
405 0 : break;
406 :
407 0 : case PCRE_INFO_SIZE:
408 0 : *((size_t *)where) = re->size;
409 0 : break;
410 :
411 0 : case PCRE_INFO_CAPTURECOUNT:
412 0 : *((int *)where) = re->top_bracket;
413 0 : break;
414 :
415 0 : case PCRE_INFO_BACKREFMAX:
416 0 : *((int *)where) = re->top_backref;
417 0 : break;
418 :
419 0 : case PCRE_INFO_FIRSTCHAR:
420 0 : *((int *)where) =
421 0 : ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422 0 : ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
423 0 : break;
424 :
425 0 : case PCRE_INFO_FIRSTTABLE:
426 0 : *((const uschar **)where) =
427 0 : (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428 0 : study->start_bits : NULL;
429 0 : break;
430 :
431 0 : case PCRE_INFO_LASTLITERAL:
432 0 : *((int *)where) =
433 0 : ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
434 0 : break;
435 :
436 0 : default: return PCRE_ERROR_BADOPTION;
437 : }
438 :
439 0 : return 0;
440 : }
441 :
442 :
443 :
444 : #ifdef DEBUG
445 : /*************************************************
446 : * Debugging function to print chars *
447 : *************************************************/
448 :
449 : /* Print a sequence of chars in printable format, stopping at the end of the
450 : subject if the requested.
451 :
452 : Arguments:
453 : p points to characters
454 : length number to print
455 : is_subject TRUE if printing from within md->start_subject
456 : md pointer to matching data block, if is_subject is TRUE
457 :
458 : Returns: nothing
459 : */
460 :
461 : static void
462 : pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
463 : {
464 : int c;
465 : if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
466 : while (length-- > 0)
467 : if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
468 : }
469 : #endif
470 :
471 :
472 :
473 :
474 : /*************************************************
475 : * Handle escapes *
476 : *************************************************/
477 :
478 : /* This function is called when a \ has been encountered. It either returns a
479 : positive value for a simple escape such as \n, or a negative value which
480 : encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481 : a positive value greater than 255 may be returned. On entry, ptr is pointing at
482 : the \. On exit, it is on the final character of the escape sequence.
483 :
484 : Arguments:
485 : ptrptr points to the pattern position pointer
486 : errorptr points to the pointer to the error message
487 : bracount number of previous extracting brackets
488 : options the options bits
489 : isclass TRUE if inside a character class
490 : cd pointer to char tables block
491 :
492 : Returns: zero or positive => a data character
493 : negative => a special escape sequence
494 : on error, errorptr is set
495 : */
496 :
497 : static int
498 0 : check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
499 : int options, BOOL isclass, compile_data *cd)
500 : {
501 0 : const uschar *ptr = *ptrptr;
502 : int c, i;
503 :
504 : /* If backslash is at the end of the pattern, it's an error. */
505 :
506 0 : c = *(++ptr);
507 0 : if (c == 0) *errorptr = ERR1;
508 :
509 : /* Digits or letters may have special meaning; all others are literals. */
510 :
511 0 : else if (c < '0' || c > 'z') {}
512 :
513 : /* Do an initial lookup in a table. A non-zero result is something that can be
514 : returned immediately. Otherwise further processing may be required. */
515 :
516 0 : else if ((i = escapes[c - '0']) != 0) c = i;
517 :
518 : /* Escapes that need further processing, or are illegal. */
519 :
520 : else
521 : {
522 : const uschar *oldptr;
523 0 : switch (c)
524 : {
525 : /* The handling of escape sequences consisting of a string of digits
526 : starting with one that is not zero is not straightforward. By experiment,
527 : the way Perl works seems to be as follows:
528 :
529 : Outside a character class, the digits are read as a decimal number. If the
530 : number is less than 10, or if there are that many previous extracting
531 : left brackets, then it is a back reference. Otherwise, up to three octal
532 : digits are read to form an escaped byte. Thus \123 is likely to be octal
533 : 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
534 : value is greater than 377, the least significant 8 bits are taken. Inside a
535 : character class, \ followed by a digit is always an octal number. */
536 :
537 0 : case '1': case '2': case '3': case '4': case '5':
538 : case '6': case '7': case '8': case '9':
539 :
540 0 : if (!isclass)
541 : {
542 0 : oldptr = ptr;
543 0 : c -= '0';
544 0 : while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
545 0 : c = c * 10 + *(++ptr) - '0';
546 0 : if (c < 10 || c <= bracount)
547 : {
548 0 : c = -(ESC_REF + c);
549 0 : break;
550 : }
551 0 : ptr = oldptr; /* Put the pointer back and fall through */
552 : }
553 :
554 : /* Handle an octal number following \. If the first digit is 8 or 9, Perl
555 : generates a binary zero byte and treats the digit as a following literal.
556 : Thus we have to pull back the pointer by one. */
557 :
558 0 : if ((c = *ptr) >= '8')
559 : {
560 0 : ptr--;
561 0 : c = 0;
562 0 : break;
563 : }
564 :
565 : /* \0 always starts an octal number, but we may drop through to here with a
566 : larger first octal digit. */
567 :
568 : case '0':
569 0 : c -= '0';
570 0 : while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571 0 : ptr[1] != '8' && ptr[1] != '9')
572 0 : c = c * 8 + *(++ptr) - '0';
573 0 : c &= 255; /* Take least significant 8 bits */
574 0 : break;
575 :
576 : /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577 : which can be greater than 0xff, but only if the ddd are hex digits. */
578 :
579 0 : case 'x':
580 : #ifdef SUPPORT_UTF8
581 : if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
582 : {
583 : const uschar *pt = ptr + 2;
584 : register int count = 0;
585 : c = 0;
586 : while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
587 : {
588 : count++;
589 : c = c * 16 + cd->lcc[*pt] -
590 : (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
591 : pt++;
592 : }
593 : if (*pt == '}')
594 : {
595 : if (c < 0 || count > 8) *errorptr = ERR34;
596 : ptr = pt;
597 : break;
598 : }
599 : /* If the sequence of hex digits does not end with '}', then we don't
600 : recognize this construct; fall through to the normal \x handling. */
601 : }
602 : #endif
603 :
604 : /* Read just a single hex char */
605 :
606 0 : c = 0;
607 0 : while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
608 : {
609 0 : ptr++;
610 0 : c = c * 16 + cd->lcc[*ptr] -
611 0 : (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
612 : }
613 0 : break;
614 :
615 : /* Other special escapes not starting with a digit are straightforward */
616 :
617 0 : case 'c':
618 0 : c = *(++ptr);
619 0 : if (c == 0)
620 : {
621 0 : *errorptr = ERR2;
622 0 : return 0;
623 : }
624 :
625 : /* A letter is upper-cased; then the 0x40 bit is flipped */
626 :
627 0 : if (c >= 'a' && c <= 'z') c = cd->fcc[c];
628 0 : c ^= 0x40;
629 0 : break;
630 :
631 : /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
632 : other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
633 : for Perl compatibility, it is a literal. This code looks a bit odd, but
634 : there used to be some cases other than the default, and there may be again
635 : in future, so I haven't "optimized" it. */
636 :
637 0 : default:
638 0 : if ((options & PCRE_EXTRA) != 0) switch(c)
639 : {
640 0 : default:
641 0 : *errorptr = ERR3;
642 0 : break;
643 : }
644 0 : break;
645 : }
646 0 : }
647 :
648 0 : *ptrptr = ptr;
649 0 : return c;
650 : }
651 :
652 :
653 :
654 : /*************************************************
655 : * Check for counted repeat *
656 : *************************************************/
657 :
658 : /* This function is called when a '{' is encountered in a place where it might
659 : start a quantifier. It looks ahead to see if it really is a quantifier or not.
660 : It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
661 : where the ddds are digits.
662 :
663 : Arguments:
664 : p pointer to the first char after '{'
665 : cd pointer to char tables block
666 :
667 : Returns: TRUE or FALSE
668 : */
669 :
670 : static BOOL
671 0 : is_counted_repeat(const uschar *p, compile_data *cd)
672 : {
673 0 : if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
674 0 : while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
675 0 : if (*p == '}') return TRUE;
676 :
677 0 : if (*p++ != ',') return FALSE;
678 0 : if (*p == '}') return TRUE;
679 :
680 0 : if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
681 0 : while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
682 0 : return (*p == '}');
683 : }
684 :
685 :
686 :
687 : /*************************************************
688 : * Read repeat counts *
689 : *************************************************/
690 :
691 : /* Read an item of the form {n,m} and return the values. This is called only
692 : after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
693 : so the syntax is guaranteed to be correct, but we need to check the values.
694 :
695 : Arguments:
696 : p pointer to first char after '{'
697 : minp pointer to int for min
698 : maxp pointer to int for max
699 : returned as -1 if no max
700 : errorptr points to pointer to error message
701 : cd pointer to character tables clock
702 :
703 : Returns: pointer to '}' on success;
704 : current ptr on error, with errorptr set
705 : */
706 :
707 : static const uschar *
708 0 : read_repeat_counts(const uschar *p, int *minp, int *maxp,
709 : const char **errorptr, compile_data *cd)
710 : {
711 0 : int min = 0;
712 0 : int max = -1;
713 :
714 0 : while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
715 :
716 0 : if (*p == '}') max = min; else
717 : {
718 0 : if (*(++p) != '}')
719 : {
720 0 : max = 0;
721 0 : while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
722 0 : if (max < min)
723 : {
724 0 : *errorptr = ERR4;
725 0 : return p;
726 : }
727 : }
728 : }
729 :
730 : /* Do paranoid checks, then fill in the required variables, and pass back the
731 : pointer to the terminating '}'. */
732 :
733 0 : if (min < 0 || min > 65535 || max < -1 || max > 65535)
734 0 : *errorptr = ERR5;
735 : else
736 : {
737 0 : *minp = min;
738 0 : *maxp = max;
739 : }
740 0 : return p;
741 : }
742 :
743 :
744 :
745 : /*************************************************
746 : * Find the fixed length of a pattern *
747 : *************************************************/
748 :
749 : /* Scan a pattern and compute the fixed length of subject that will match it,
750 : if the length is fixed. This is needed for dealing with backward assertions.
751 :
752 : Arguments:
753 : code points to the start of the pattern (the bracket)
754 : options the compiling options
755 :
756 : Returns: the fixed length, or -1 if there is no fixed length
757 : */
758 :
759 : static int
760 0 : find_fixedlength(uschar *code, int options)
761 : {
762 0 : int length = -1;
763 :
764 0 : register int branchlength = 0;
765 0 : register uschar *cc = code + 3;
766 :
767 : /* Scan along the opcodes for this branch. If we get to the end of the
768 : branch, check the length against that of the other branches. */
769 :
770 : for (;;)
771 0 : {
772 : int d;
773 0 : register int op = *cc;
774 0 : if (op >= OP_BRA) op = OP_BRA;
775 :
776 0 : switch (op)
777 : {
778 0 : case OP_BRA:
779 : case OP_ONCE:
780 : case OP_COND:
781 0 : d = find_fixedlength(cc, options);
782 0 : if (d < 0) return -1;
783 0 : branchlength += d;
784 0 : do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
785 0 : cc += 3;
786 0 : break;
787 :
788 : /* Reached end of a branch; if it's a ket it is the end of a nested
789 : call. If it's ALT it is an alternation in a nested call. If it is
790 : END it's the end of the outer call. All can be handled by the same code. */
791 :
792 0 : case OP_ALT:
793 : case OP_KET:
794 : case OP_KETRMAX:
795 : case OP_KETRMIN:
796 : case OP_END:
797 0 : if (length < 0) length = branchlength;
798 0 : else if (length != branchlength) return -1;
799 0 : if (*cc != OP_ALT) return length;
800 0 : cc += 3;
801 0 : branchlength = 0;
802 0 : break;
803 :
804 : /* Skip over assertive subpatterns */
805 :
806 0 : case OP_ASSERT:
807 : case OP_ASSERT_NOT:
808 : case OP_ASSERTBACK:
809 : case OP_ASSERTBACK_NOT:
810 0 : do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811 0 : cc += 3;
812 0 : break;
813 :
814 : /* Skip over things that don't match chars */
815 :
816 0 : case OP_REVERSE:
817 0 : cc++;
818 : /* Fall through */
819 :
820 0 : case OP_CREF:
821 : case OP_OPT:
822 0 : cc++;
823 : /* Fall through */
824 :
825 0 : case OP_SOD:
826 : case OP_EOD:
827 : case OP_EODN:
828 : case OP_CIRC:
829 : case OP_DOLL:
830 : case OP_NOT_WORD_BOUNDARY:
831 : case OP_WORD_BOUNDARY:
832 0 : cc++;
833 0 : break;
834 :
835 : /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836 : This requires a scan of the string, unfortunately. We assume valid UTF-8
837 : strings, so all we do is reduce the length by one for byte whose bits are
838 : 10xxxxxx. */
839 :
840 0 : case OP_CHARS:
841 0 : branchlength += *(++cc);
842 : #ifdef SUPPORT_UTF8
843 : for (d = 1; d <= *cc; d++)
844 : if ((cc[d] & 0xc0) == 0x80) branchlength--;
845 : #endif
846 0 : cc += *cc + 1;
847 0 : break;
848 :
849 : /* Handle exact repetitions */
850 :
851 0 : case OP_EXACT:
852 : case OP_TYPEEXACT:
853 0 : branchlength += (cc[1] << 8) + cc[2];
854 0 : cc += 4;
855 0 : break;
856 :
857 : /* Handle single-char matchers */
858 :
859 0 : case OP_NOT_DIGIT:
860 : case OP_DIGIT:
861 : case OP_NOT_WHITESPACE:
862 : case OP_WHITESPACE:
863 : case OP_NOT_WORDCHAR:
864 : case OP_WORDCHAR:
865 : case OP_ANY:
866 0 : branchlength++;
867 0 : cc++;
868 0 : break;
869 :
870 :
871 : /* Check a class for variable quantification */
872 :
873 0 : case OP_CLASS:
874 0 : cc += (*cc == OP_REF)? 2 : 33;
875 :
876 0 : switch (*cc)
877 : {
878 0 : case OP_CRSTAR:
879 : case OP_CRMINSTAR:
880 : case OP_CRQUERY:
881 : case OP_CRMINQUERY:
882 0 : return -1;
883 :
884 0 : case OP_CRRANGE:
885 : case OP_CRMINRANGE:
886 0 : if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
887 0 : branchlength += (cc[1] << 8) + cc[2];
888 0 : cc += 5;
889 0 : break;
890 :
891 0 : default:
892 0 : branchlength++;
893 : }
894 0 : break;
895 :
896 : /* Anything else is variable length */
897 :
898 0 : default:
899 0 : return -1;
900 : }
901 : }
902 : /* Control never gets here */
903 : }
904 :
905 :
906 :
907 :
908 : /*************************************************
909 : * Check for POSIX class syntax *
910 : *************************************************/
911 :
912 : /* This function is called when the sequence "[:" or "[." or "[=" is
913 : encountered in a character class. It checks whether this is followed by an
914 : optional ^ and then a sequence of letters, terminated by a matching ":]" or
915 : ".]" or "=]".
916 :
917 : Argument:
918 : ptr pointer to the initial [
919 : endptr where to return the end pointer
920 : cd pointer to compile data
921 :
922 : Returns: TRUE or FALSE
923 : */
924 :
925 : static BOOL
926 0 : check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
927 : {
928 : int terminator; /* Don't combine these lines; the Solaris cc */
929 0 : terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
930 0 : if (*(++ptr) == '^') ptr++;
931 0 : while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932 0 : if (*ptr == terminator && ptr[1] == ']')
933 : {
934 0 : *endptr = ptr;
935 0 : return TRUE;
936 : }
937 0 : return FALSE;
938 : }
939 :
940 :
941 :
942 :
943 : /*************************************************
944 : * Check POSIX class name *
945 : *************************************************/
946 :
947 : /* This function is called to check the name given in a POSIX-style class entry
948 : such as [:alnum:].
949 :
950 : Arguments:
951 : ptr points to the first letter
952 : len the length of the name
953 :
954 : Returns: a value representing the name, or -1 if unknown
955 : */
956 :
957 : static int
958 0 : check_posix_name(const uschar *ptr, int len)
959 : {
960 0 : register int yield = 0;
961 0 : while (posix_name_lengths[yield] != 0)
962 : {
963 0 : if (len == posix_name_lengths[yield] &&
964 0 : strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
965 0 : yield++;
966 : }
967 0 : return -1;
968 : }
969 :
970 :
971 :
972 :
973 : /*************************************************
974 : * Compile one branch *
975 : *************************************************/
976 :
977 : /* Scan one branch of the pattern, compiling it into the code vector. The branch
978 : ends at end of string, '|' or ')'; *codeptr and *ptrptr are updated at that point.
979 : Arguments:
980 : options the option bits
981 : brackets points to number of brackets used
982 : codeptr points to the pointer to the current code point
983 : ptrptr points to the current pattern pointer
984 : errorptr points to pointer to error message
985 : optchanged set to the value of the last OP_OPT item compiled
986 : reqchar set to the last literal character required, else -1
987 : countlits set to count of mandatory literal characters
988 : cd contains pointers to tables
989 :
990 : Returns: TRUE on success
991 : FALSE, with *errorptr set on error and *ptrptr left at the failing position
992 : */
993 :
994 : static BOOL
995 0 : compile_branch(int options, int *brackets, uschar **codeptr,
996 : const uschar **ptrptr, const char **errorptr, int *optchanged,
997 : int *reqchar, int *countlits, compile_data *cd)
998 : {
999 : int repeat_type, op_type;
1000 : int repeat_min, repeat_max;
1001 : int bravalue, length;
1002 : int greedy_default, greedy_non_default;
1003 : int prevreqchar;
1004 0 : int condcount = 0; /* number of branches counted in an OP_COND group */
1005 0 : int subcountlits = 0; /* literal count from the last bracket group; used to back off on a zero repeat */
1006 : register int c;
1007 0 : register uschar *code = *codeptr;
1008 : uschar *tempcode;
1009 0 : const uschar *ptr = *ptrptr;
1010 : const uschar *tempptr;
1011 0 : uschar *previous = NULL; /* start of the last repeatable item, or NULL if there is none */
1012 : uschar class[32]; /* 256-bit map built up for a character class */
1013 :
1014 : /* Set up the default and non-default settings for greediness */
1015 :
1016 0 : greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017 0 : greedy_non_default = greedy_default ^ 1; /* the opposite of greedy_default */
1018 :
1019 : /* Initialize no required char, and count of literals */
1020 :
1021 0 : *reqchar = prevreqchar = -1;
1022 0 : *countlits = 0;
1023 :
1024 : /* Switch on next character until the end of the branch */
1025 :
1026 0 : for (;; ptr++)
1027 0 : {
1028 : BOOL negate_class;
1029 : int class_charcount;
1030 : int class_lastchar;
1031 : int newoptions;
1032 : int condref;
1033 : int subreqchar;
1034 :
1035 0 : c = *ptr;
1036 0 : if ((options & PCRE_EXTENDED) != 0)
1037 : {
1038 0 : if ((cd->ctypes[c] & ctype_space) != 0) continue;
1039 0 : if (c == '#')
1040 : {
1041 : /* The space before the ; is to avoid a warning on a silly compiler
1042 : on the Macintosh. */
1043 0 : while ((c = *(++ptr)) != 0 && c != '\n') ;
1044 0 : continue;
1045 : }
1046 : }
1047 :
1048 0 : switch(c)
1049 : {
1050 : /* The branch terminates at end of string, |, or ). */
1051 :
1052 0 : case 0:
1053 : case '|':
1054 : case ')':
1055 0 : *codeptr = code;
1056 0 : *ptrptr = ptr;
1057 0 : return TRUE;
1058 :
1059 : /* Handle single-character metacharacters */
1060 :
1061 0 : case '^':
1062 0 : previous = NULL;
1063 0 : *code++ = OP_CIRC;
1064 0 : break;
1065 :
1066 0 : case '$':
1067 0 : previous = NULL;
1068 0 : *code++ = OP_DOLL;
1069 0 : break;
1070 :
1071 0 : case '.':
1072 0 : previous = code;
1073 0 : *code++ = OP_ANY;
1074 0 : break;
1075 :
1076 : /* Character classes. These always build a 32-byte bitmap of the permitted
1077 : characters, except in the special case where there is only one character.
1078 : For negated classes, we build the map as usual, then invert it at the end.
1079 : */
1080 :
1081 0 : case '[':
1082 0 : previous = code;
1083 0 : *code++ = OP_CLASS;
1084 :
1085 : /* If the first character is '^', set the negation flag and skip it. */
1086 :
1087 0 : if ((c = *(++ptr)) == '^')
1088 : {
1089 0 : negate_class = TRUE;
1090 0 : c = *(++ptr);
1091 : }
1092 0 : else negate_class = FALSE;
1093 :
1094 : /* Keep a count of chars so that we can optimize the case of just a single
1095 : character. */
1096 :
1097 0 : class_charcount = 0;
1098 0 : class_lastchar = -1;
1099 :
1100 : /* Initialize the 32-char bit map to all zeros. We have to build the
1101 : map in a temporary bit of store, in case the class contains only 1
1102 : character, because in that case the compiled code doesn't use the
1103 : bit map. */
1104 :
1105 0 : memset(class, 0, 32 * sizeof(uschar));
1106 :
1107 : /* Process characters until ] is reached. By writing this as a "do" it
1108 : means that an initial ] is taken as a data character. */
1109 :
1110 : do
1111 : {
1112 0 : if (c == 0)
1113 : {
1114 0 : *errorptr = ERR6;
1115 0 : goto FAILED;
1116 : }
1117 :
1118 : /* Handle POSIX class names. Perl allows a negation extension of the
1119 : form [:^name]. A square bracket that doesn't match the syntax is
1120 : treated as a literal. We also recognize the POSIX constructions
1121 : [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1122 : 5.6 does. */
1123 :
1124 0 : if (c == '[' &&
1125 0 : (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126 0 : check_posix_syntax(ptr, &tempptr, cd))
1127 : {
1128 0 : BOOL local_negate = FALSE;
1129 : int posix_class, i;
1130 0 : register const uschar *cbits = cd->cbits;
1131 :
1132 0 : if (ptr[1] != ':')
1133 : {
1134 0 : *errorptr = ERR31;
1135 0 : goto FAILED;
1136 : }
1137 :
1138 0 : ptr += 2;
1139 0 : if (*ptr == '^')
1140 : {
1141 0 : local_negate = TRUE;
1142 0 : ptr++;
1143 : }
1144 :
1145 0 : posix_class = check_posix_name(ptr, tempptr - ptr);
1146 0 : if (posix_class < 0)
1147 : {
1148 0 : *errorptr = ERR30;
1149 0 : goto FAILED;
1150 : }
1151 :
1152 : /* If matching is caseless, upper and lower are converted to
1153 : alpha. This relies on the fact that the class table starts with
1154 : alpha, lower, upper as the first 3 entries. */
1155 :
1156 0 : if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1157 0 : posix_class = 0;
1158 :
1159 : /* Or into the map we are building up to 3 of the static class
1160 : tables, or their negations. */
1161 :
1162 0 : posix_class *= 3;
1163 0 : for (i = 0; i < 3; i++)
1164 : {
1165 0 : int taboffset = posix_class_maps[posix_class + i];
1166 0 : if (taboffset < 0) break;
1167 0 : if (local_negate)
1168 0 : for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1169 : else
1170 0 : for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1171 : }
1172 :
1173 0 : ptr = tempptr + 1;
1174 0 : class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1175 0 : continue;
1176 : }
1177 :
1178 : /* Backslash may introduce a single character, or it may introduce one
1179 : of the specials, which just set a flag. Escaped items are checked for
1180 : validity in the pre-compiling pass. The sequence \b is a special case.
1181 : Inside a class (and only there) it is treated as backspace. Elsewhere
1182 : it marks a word boundary. Other escapes have preset maps ready to
1183 : or into the one we are building. We assume they have more than one
1184 : character in them, so set class_charcount bigger than one. */
1185 :
1186 0 : if (c == '\\')
1187 : {
1188 0 : c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1189 0 : if (-c == ESC_b) c = '\b';
1190 0 : else if (c < 0)
1191 : {
1192 0 : register const uschar *cbits = cd->cbits;
1193 0 : class_charcount = 10;
1194 0 : switch (-c)
1195 : {
1196 0 : case ESC_d:
1197 0 : for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1198 0 : continue;
1199 :
1200 0 : case ESC_D:
1201 0 : for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1202 0 : continue;
1203 :
1204 0 : case ESC_w:
1205 0 : for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1206 0 : continue;
1207 :
1208 0 : case ESC_W:
1209 0 : for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1210 0 : continue;
1211 :
1212 0 : case ESC_s:
1213 0 : for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1214 0 : continue;
1215 :
1216 0 : case ESC_S:
1217 0 : for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1218 0 : continue;
1219 :
1220 0 : default:
1221 0 : *errorptr = ERR7;
1222 0 : goto FAILED;
1223 : }
1224 : }
1225 :
1226 : /* Fall through if single character, but don't at present allow
1227 : chars > 255 in UTF-8 mode. */
1228 :
1229 : #ifdef SUPPORT_UTF8
1230 : if (c > 255)
1231 : {
1232 : *errorptr = ERR33;
1233 : goto FAILED;
1234 : }
1235 : #endif
1236 : }
1237 :
1238 : /* A single character may be followed by '-' to form a range. However,
1239 : Perl does not permit ']' to be the end of the range. A '-' character
1240 : here is treated as a literal. */
1241 :
1242 0 : if (ptr[1] == '-' && ptr[2] != ']')
1243 : {
1244 : int d;
1245 0 : ptr += 2;
1246 0 : d = *ptr;
1247 :
1248 0 : if (d == 0)
1249 : {
1250 0 : *errorptr = ERR6;
1251 0 : goto FAILED;
1252 : }
1253 :
1254 : /* The second part of a range can be a single-character escape, but
1255 : not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256 : in such circumstances. */
1257 :
1258 0 : if (d == '\\')
1259 : {
1260 0 : const uschar *oldptr = ptr;
1261 0 : d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262 :
1263 : #ifdef SUPPORT_UTF8
1264 : if (d > 255)
1265 : {
1266 : *errorptr = ERR33;
1267 : goto FAILED;
1268 : }
1269 : #endif
1270 : /* \b is backspace; any other special means the '-' was literal */
1271 :
1272 0 : if (d < 0)
1273 : {
1274 0 : if (d == -ESC_b) d = '\b'; else
1275 : {
1276 0 : ptr = oldptr - 2;
1277 0 : goto SINGLE_CHARACTER; /* A few lines below */
1278 : }
1279 : }
1280 : }
1281 :
1282 0 : if (d < c)
1283 : {
1284 0 : *errorptr = ERR8;
1285 0 : goto FAILED;
1286 : }
1287 :
1288 0 : for (; c <= d; c++)
1289 : {
1290 0 : class[c/8] |= (1 << (c&7));
1291 0 : if ((options & PCRE_CASELESS) != 0)
1292 : {
1293 0 : int uc = cd->fcc[c]; /* flip case */
1294 0 : class[uc/8] |= (1 << (uc&7));
1295 : }
1296 0 : class_charcount++; /* in case a one-char range */
1297 0 : class_lastchar = c;
1298 : }
1299 0 : continue; /* Go get the next char in the class */
1300 : }
1301 :
1302 : /* Handle a lone single character - we can get here for a normal
1303 : non-escape char, or after \ that introduces a single character. */
1304 :
1305 0 : SINGLE_CHARACTER:
1306 :
1307 0 : class [c/8] |= (1 << (c&7));
1308 0 : if ((options & PCRE_CASELESS) != 0)
1309 : {
1310 0 : c = cd->fcc[c]; /* flip case */
1311 0 : class[c/8] |= (1 << (c&7));
1312 : }
1313 0 : class_charcount++;
1314 0 : class_lastchar = c;
1315 : }
1316 :
1317 : /* Loop until ']' reached; the check for end of string happens inside the
1318 : loop. This "while" is the end of the "do" above. */
1319 :
1320 0 : while ((c = *(++ptr)) != ']');
1321 :
1322 : /* If class_charcount is 1 and class_lastchar is not negative, we saw
1323 : precisely one character. This doesn't need the whole 32-byte bit map.
1324 : We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1325 : it's negative. */
1326 :
1327 0 : if (class_charcount == 1 && class_lastchar >= 0)
1328 : {
1329 0 : if (negate_class)
1330 : {
1331 0 : code[-1] = OP_NOT;
1332 : }
1333 : else
1334 : {
1335 0 : code[-1] = OP_CHARS;
1336 0 : *code++ = 1;
1337 : }
1338 0 : *code++ = class_lastchar;
1339 : }
1340 :
1341 : /* Otherwise, negate the 32-byte map if necessary, and copy it into
1342 : the code vector. */
1343 :
1344 : else
1345 : {
1346 0 : if (negate_class)
1347 0 : for (c = 0; c < 32; c++) code[c] = ~class[c];
1348 : else
1349 0 : memcpy(code, class, 32);
1350 0 : code += 32;
1351 : }
1352 0 : break;
1353 :
1354 : /* Various kinds of repeat */
1355 :
1356 0 : case '{':
1357 0 : if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1358 0 : ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1359 0 : if (*errorptr != NULL) goto FAILED;
1360 0 : goto REPEAT;
1361 :
1362 0 : case '*':
1363 0 : repeat_min = 0;
1364 0 : repeat_max = -1;
1365 0 : goto REPEAT;
1366 :
1367 0 : case '+':
1368 0 : repeat_min = 1;
1369 0 : repeat_max = -1;
1370 0 : goto REPEAT;
1371 :
1372 0 : case '?':
1373 0 : repeat_min = 0;
1374 0 : repeat_max = 1;
1375 :
1376 0 : REPEAT:
1377 0 : if (previous == NULL)
1378 : {
1379 0 : *errorptr = ERR9;
1380 0 : goto FAILED;
1381 : }
1382 :
1383 : /* If the next character is '?' this is a minimizing repeat, by default,
1384 : but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1385 : next character. */
1386 :
1387 0 : if (ptr[1] == '?')
1388 0 : { repeat_type = greedy_non_default; ptr++; }
1389 0 : else repeat_type = greedy_default;
1390 :
1391 : /* If previous was a string of characters, chop off the last one and use it
1392 : as the subject of the repeat. If there was only one character, we can
1393 : abolish the previous item altogether. A repeat with a zero minimum wipes
1394 : out any reqchar setting, backing up to the previous value. We must also
1395 : adjust the countlits value. */
1396 :
1397 0 : if (*previous == OP_CHARS)
1398 : {
1399 0 : int len = previous[1];
1400 :
1401 0 : if (repeat_min == 0) *reqchar = prevreqchar;
1402 0 : *countlits += repeat_min - 1;
1403 :
1404 0 : if (len == 1)
1405 : {
1406 0 : c = previous[2];
1407 0 : code = previous;
1408 : }
1409 : else
1410 : {
1411 0 : c = previous[len+1];
1412 0 : previous[1]--;
1413 0 : code--;
1414 : }
1415 0 : op_type = 0; /* Use single-char op codes */
1416 0 : goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1417 : }
1418 :
1419 : /* If previous was a single negated character ([^a] or similar), we use
1420 : one of the special opcodes, replacing it. The code is shared with single-
1421 : character repeats by adding a suitable offset into repeat_type. */
1422 :
1423 0 : else if ((int)*previous == OP_NOT)
1424 : {
1425 0 : op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1426 0 : c = previous[1];
1427 0 : code = previous;
1428 0 : goto OUTPUT_SINGLE_REPEAT;
1429 : }
1430 :
1431 : /* If previous was a character type match (\d or similar), abolish it and
1432 : create a suitable repeat item. The code is shared with single-character
1433 : repeats by adding a suitable offset into repeat_type. */
1434 :
1435 0 : else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1436 : {
1437 0 : op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1438 0 : c = *previous;
1439 0 : code = previous;
1440 :
1441 0 : OUTPUT_SINGLE_REPEAT:
1442 :
1443 : /* If the maximum is zero then the minimum must also be zero; Perl allows
1444 : this case, so we do too - by simply omitting the item altogether. */
1445 :
1446 0 : if (repeat_max == 0) goto END_REPEAT;
1447 :
1448 : /* Combine the op_type with the repeat_type */
1449 :
1450 0 : repeat_type += op_type;
1451 :
1452 : /* A minimum of zero is handled either as the special case * or ?, or as
1453 : an UPTO, with the maximum given. */
1454 :
1455 0 : if (repeat_min == 0)
1456 : {
1457 0 : if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1458 0 : else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1459 : else
1460 : {
1461 0 : *code++ = OP_UPTO + repeat_type;
1462 0 : *code++ = repeat_max >> 8;
1463 0 : *code++ = (repeat_max & 255);
1464 : }
1465 : }
1466 :
1467 : /* The case {1,} is handled as the special case + */
1468 :
1469 0 : else if (repeat_min == 1 && repeat_max == -1)
1470 0 : *code++ = OP_PLUS + repeat_type;
1471 :
1472 : /* The case {n,n} is just an EXACT, while the general case {n,m} is
1473 : handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1474 :
1475 : else
1476 : {
1477 0 : if (repeat_min != 1)
1478 : {
1479 0 : *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1480 0 : *code++ = repeat_min >> 8;
1481 0 : *code++ = (repeat_min & 255);
1482 : }
1483 :
1484 : /* If the minimum is 1 and the previous item was a character string,
1485 : we either have to put back the item that got cancelled if the string
1486 : length was 1, or add the character back onto the end of a longer
1487 : string. For a character type nothing need be done; it will just get
1488 : put back naturally. Note that the final character is always going to
1489 : get added below. */
1490 :
1491 0 : else if (*previous == OP_CHARS)
1492 : {
1493 0 : if (code == previous) code += 2; else previous[1]++;
1494 : }
1495 :
1496 : /* For a single negated character we also have to put back the
1497 : item that got cancelled. */
1498 :
1499 0 : else if (*previous == OP_NOT) code++;
1500 :
1501 : /* If the maximum is unlimited, insert an OP_STAR. */
1502 :
1503 0 : if (repeat_max < 0)
1504 : {
1505 0 : *code++ = c;
1506 0 : *code++ = OP_STAR + repeat_type;
1507 : }
1508 :
1509 : /* Else insert an UPTO if the max is greater than the min. */
1510 :
1511 0 : else if (repeat_max != repeat_min)
1512 : {
1513 0 : *code++ = c;
1514 0 : repeat_max -= repeat_min;
1515 0 : *code++ = OP_UPTO + repeat_type;
1516 0 : *code++ = repeat_max >> 8;
1517 0 : *code++ = (repeat_max & 255);
1518 : }
1519 : }
1520 :
1521 : /* The character or character type itself comes last in all cases. */
1522 :
1523 0 : *code++ = c;
1524 : }
1525 :
1526 : /* If previous was a character class or a back reference, we put the repeat
1527 : stuff after it, but just skip the item if the repeat was {0,0}. */
1528 :
1529 0 : else if (*previous == OP_CLASS || *previous == OP_REF)
1530 : {
1531 0 : if (repeat_max == 0)
1532 : {
1533 0 : code = previous;
1534 0 : goto END_REPEAT;
1535 : }
1536 0 : if (repeat_min == 0 && repeat_max == -1)
1537 0 : *code++ = OP_CRSTAR + repeat_type;
1538 0 : else if (repeat_min == 1 && repeat_max == -1)
1539 0 : *code++ = OP_CRPLUS + repeat_type;
1540 0 : else if (repeat_min == 0 && repeat_max == 1)
1541 0 : *code++ = OP_CRQUERY + repeat_type;
1542 : else
1543 : {
1544 0 : *code++ = OP_CRRANGE + repeat_type;
1545 0 : *code++ = repeat_min >> 8;
1546 0 : *code++ = repeat_min & 255;
1547 0 : if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max; unlimited is stored as 0 */
1548 0 : *code++ = repeat_max >> 8;
1549 0 : *code++ = repeat_max & 255;
1550 : }
1551 : }
1552 :
1553 : /* If previous was a bracket group, we may have to replicate it in certain
1554 : cases. */
1555 :
1556 0 : else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1557 0 : (int)*previous == OP_COND)
1558 0 : {
1559 : register int i;
1560 0 : int ketoffset = 0;
1561 0 : int len = code - previous;
1562 0 : uschar *bralink = NULL;
1563 :
1564 : /* If the maximum repeat count is unlimited, find the end of the bracket
1565 : by scanning through from the start, and compute the offset back to it
1566 : from the current code pointer. There may be an OP_OPT setting following
1567 : the final KET, so we can't find the end just by going back from the code
1568 : pointer. */
1569 :
1570 0 : if (repeat_max == -1)
1571 : {
1572 0 : register uschar *ket = previous;
1573 0 : do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1574 0 : ketoffset = code - ket;
1575 : }
1576 :
1577 : /* The case of a zero minimum is special because of the need to stick
1578 : OP_BRAZERO in front of it, and because the group appears once in the
1579 : data, whereas in other cases it appears the minimum number of times. For
1580 : this reason, it is simplest to treat this case separately, as otherwise
1581 : the code gets far too messy. There are several special subcases when the
1582 : minimum is zero. */
1583 :
1584 0 : if (repeat_min == 0)
1585 : {
1586 : /* If we set up a required char from the bracket, we must back off
1587 : to the previous value and reset the countlits value too. */
1588 :
1589 0 : if (subcountlits > 0)
1590 : {
1591 0 : *reqchar = prevreqchar;
1592 0 : *countlits -= subcountlits;
1593 : }
1594 :
1595 : /* If the maximum is also zero, we just omit the group from the output
1596 : altogether. */
1597 :
1598 0 : if (repeat_max == 0)
1599 : {
1600 0 : code = previous;
1601 0 : goto END_REPEAT;
1602 : }
1603 :
1604 : /* If the maximum is 1 or unlimited, we just have to stick in the
1605 : BRAZERO and do no more at this point. */
1606 :
1607 0 : if (repeat_max <= 1)
1608 : {
1609 0 : memmove(previous+1, previous, len);
1610 0 : code++;
1611 0 : *previous++ = OP_BRAZERO + repeat_type;
1612 : }
1613 :
1614 : /* If the maximum is greater than 1 and limited, we have to replicate
1615 : in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1616 : The first one has to be handled carefully because it's the original
1617 : copy, which has to be moved up. The remainder can be handled by code
1618 : that is common with the non-zero minimum case below. We just have to
1619 : adjust the value of repeat_max, since one less copy is required. */
1620 :
1621 : else
1622 : {
1623 : int offset;
1624 0 : memmove(previous+4, previous, len);
1625 0 : code += 4;
1626 0 : *previous++ = OP_BRAZERO + repeat_type;
1627 0 : *previous++ = OP_BRA;
1628 :
1629 : /* We chain together the bracket offset fields that have to be
1630 : filled in later when the ends of the brackets are reached. */
1631 :
1632 0 : offset = (bralink == NULL)? 0 : previous - bralink;
1633 0 : bralink = previous;
1634 0 : *previous++ = offset >> 8;
1635 0 : *previous++ = offset & 255;
1636 : }
1637 :
1638 0 : repeat_max--;
1639 : }
1640 :
1641 : /* If the minimum is greater than zero, replicate the group as many
1642 : times as necessary, and adjust the maximum to the number of subsequent
1643 : copies that we need. */
1644 :
1645 : else
1646 : {
1647 0 : for (i = 1; i < repeat_min; i++)
1648 : {
1649 0 : memcpy(code, previous, len);
1650 0 : code += len;
1651 : }
1652 0 : if (repeat_max > 0) repeat_max -= repeat_min;
1653 : }
1654 :
1655 : /* This code is common to both the zero and non-zero minimum cases. If
1656 : the maximum is limited, it replicates the group in a nested fashion,
1657 : remembering the bracket starts on a stack. In the case of a zero minimum,
1658 : the first one was set up above. In all cases the repeat_max now specifies
1659 : the number of additional copies needed. */
1660 :
1661 0 : if (repeat_max >= 0)
1662 : {
1663 0 : for (i = repeat_max - 1; i >= 0; i--)
1664 : {
1665 0 : *code++ = OP_BRAZERO + repeat_type;
1666 :
1667 : /* All but the final copy start a new nesting, maintaining the
1668 : chain of brackets outstanding. */
1669 :
1670 0 : if (i != 0)
1671 : {
1672 : int offset;
1673 0 : *code++ = OP_BRA;
1674 0 : offset = (bralink == NULL)? 0 : code - bralink;
1675 0 : bralink = code;
1676 0 : *code++ = offset >> 8;
1677 0 : *code++ = offset & 255;
1678 : }
1679 :
1680 0 : memcpy(code, previous, len);
1681 0 : code += len;
1682 : }
1683 :
1684 : /* Now chain through the pending brackets, and fill in their length
1685 : fields (which are holding the chain links pro tem). */
1686 :
1687 0 : while (bralink != NULL)
1688 : {
1689 : int oldlinkoffset;
1690 0 : int offset = code - bralink + 1;
1691 0 : uschar *bra = code - offset;
1692 0 : oldlinkoffset = (bra[1] << 8) + bra[2];
1693 0 : bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1694 0 : *code++ = OP_KET;
1695 0 : *code++ = bra[1] = offset >> 8;
1696 0 : *code++ = bra[2] = (offset & 255);
1697 : }
1698 : }
1699 :
1700 : /* If the maximum is unlimited, set a repeater in the final copy. We
1701 : can't just offset backwards from the current code point, because we
1702 : don't know if there's been an options resetting after the ket. The
1703 : correct offset was computed above. */
1704 :
1705 0 : else code[-ketoffset] = OP_KETRMAX + repeat_type;
1706 : }
1707 :
1708 : /* Else there's some kind of shambles */
1709 :
1710 : else
1711 : {
1712 0 : *errorptr = ERR11;
1713 0 : goto FAILED;
1714 : }
1715 :
1716 : /* In all cases we no longer have a previous item. */
1717 :
1718 0 : END_REPEAT:
1719 0 : previous = NULL;
1720 0 : break;
1721 :
1722 :
1723 : /* Start of nested bracket sub-expression, or comment or lookahead or
1724 : lookbehind or option setting or condition. First deal with special things
1725 : that can come after a bracket; all are introduced by ?, and the appearance
1726 : of any of them means that this is not a referencing group. They were
1727 : checked for validity in the first pass over the string, so we don't have to
1728 : check for syntax errors here. */
1729 :
1730 0 : case '(':
1731 0 : newoptions = options;
1732 0 : condref = -1;
1733 :
1734 0 : if (*(++ptr) == '?')
1735 : {
1736 : int set, unset;
1737 : int *optset;
1738 :
1739 0 : switch (*(++ptr))
1740 : {
1741 0 : case '#': /* Comment; skip to ket */
1742 0 : ptr++;
1743 0 : while (*ptr != ')') ptr++;
1744 0 : continue;
1745 :
1746 0 : case ':': /* Non-extracting bracket */
1747 0 : bravalue = OP_BRA;
1748 0 : ptr++;
1749 0 : break;
1750 :
1751 0 : case '(':
1752 0 : bravalue = OP_COND; /* Conditional group */
1753 0 : if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1754 : {
1755 0 : condref = *ptr - '0';
1756 0 : while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1757 0 : if (condref == 0)
1758 : {
1759 0 : *errorptr = ERR35;
1760 0 : goto FAILED;
1761 : }
1762 0 : ptr++;
1763 : }
1764 0 : else ptr--; /* not a digit: back up to the second '(' */
1765 0 : break;
1766 :
1767 0 : case '=': /* Positive lookahead */
1768 0 : bravalue = OP_ASSERT;
1769 0 : ptr++;
1770 0 : break;
1771 :
1772 0 : case '!': /* Negative lookahead */
1773 0 : bravalue = OP_ASSERT_NOT;
1774 0 : ptr++;
1775 0 : break;
1776 :
1777 0 : case '<': /* Lookbehinds */
1778 0 : switch (*(++ptr))
1779 : {
1780 0 : case '=': /* Positive lookbehind */
1781 0 : bravalue = OP_ASSERTBACK;
1782 0 : ptr++;
1783 0 : break;
1784 :
1785 0 : case '!': /* Negative lookbehind */
1786 0 : bravalue = OP_ASSERTBACK_NOT;
1787 0 : ptr++;
1788 0 : break;
1789 :
1790 0 : default: /* Syntax error */
1791 0 : *errorptr = ERR24;
1792 0 : goto FAILED;
1793 : }
1794 0 : break;
1795 :
1796 0 : case '>': /* One-time brackets */
1797 0 : bravalue = OP_ONCE;
1798 0 : ptr++;
1799 0 : break;
1800 :
1801 0 : case 'R': /* Pattern recursion */
1802 0 : *code++ = OP_RECURSE;
1803 0 : ptr++;
1804 0 : continue;
1805 :
1806 0 : default: /* Option setting */
1807 0 : set = unset = 0;
1808 0 : optset = &set;
1809 :
1810 0 : while (*ptr != ')' && *ptr != ':')
1811 : {
1812 0 : switch (*ptr++)
1813 : {
1814 0 : case '-': optset = &unset; break;
1815 :
1816 0 : case 'i': *optset |= PCRE_CASELESS; break;
1817 0 : case 'm': *optset |= PCRE_MULTILINE; break;
1818 0 : case 's': *optset |= PCRE_DOTALL; break;
1819 0 : case 'x': *optset |= PCRE_EXTENDED; break;
1820 0 : case 'U': *optset |= PCRE_UNGREEDY; break;
1821 0 : case 'X': *optset |= PCRE_EXTRA; break;
1822 :
1823 0 : default:
1824 0 : *errorptr = ERR12;
1825 0 : goto FAILED;
1826 : }
1827 : }
1828 :
1829 : /* Set up the changed option bits, but don't change anything yet. */
1830 :
1831 0 : newoptions = (options | set) & (~unset);
1832 :
1833 : /* If the options ended with ')' this is not the start of a nested
1834 : group with option changes, so the options change at this level. At top
1835 : level there is nothing else to be done (the options will in fact have
1836 : been set from the start of compiling as a result of the first pass) but
1837 : at an inner level we must compile code to change the ims options if
1838 : necessary, and pass the new setting back so that it can be put at the
1839 : start of any following branches, and when this group ends, a resetting
1840 : item can be compiled. */
1841 :
1842 0 : if (*ptr == ')')
1843 : {
1844 0 : if ((options & PCRE_INGROUP) != 0 &&
1845 0 : (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1846 : {
1847 0 : *code++ = OP_OPT;
1848 0 : *code++ = *optchanged = newoptions & PCRE_IMS;
1849 : }
1850 0 : options = newoptions; /* Change options at this level */
1851 0 : previous = NULL; /* This item can't be repeated */
1852 0 : continue; /* It is complete */
1853 : }
1854 :
1855 : /* If the options ended with ':' we are heading into a nested group
1856 : with possible change of options. Such groups are non-capturing and are
1857 : not assertions of any kind. All we need to do is skip over the ':';
1858 : the newoptions value is handled below. */
1859 :
1860 0 : bravalue = OP_BRA;
1861 0 : ptr++;
1862 : }
1863 : }
1864 :
1865 : /* Else we have a referencing group; adjust the opcode. */
1866 :
1867 : else
1868 : {
1869 0 : if (++(*brackets) > EXTRACT_MAX)
1870 : {
1871 0 : *errorptr = ERR13;
1872 0 : goto FAILED;
1873 : }
1874 0 : bravalue = OP_BRA + *brackets;
1875 : }
1876 :
1877 : /* Process nested bracketed re. Assertions may not be repeated, but other
1878 : kinds can be. We copy code into a non-register variable in order to be able
1879 : to pass its address because some compilers complain otherwise. Pass in a
1880 : new setting for the ims options if they have changed. */
1881 :
1882 0 : previous = (bravalue >= OP_ONCE)? code : NULL;
1883 0 : *code = bravalue;
1884 0 : tempcode = code;
1885 :
1886 0 : if (!compile_regex(
1887 : options | PCRE_INGROUP, /* Set for all nested groups */
1888 0 : ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1889 : newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1890 : brackets, /* Bracket level */
1891 : &tempcode, /* Where to put code (updated) */
1892 : &ptr, /* Input pointer (updated) */
1893 : errorptr, /* Where to put an error message */
1894 : (bravalue == OP_ASSERTBACK ||
1895 : bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1896 : condref, /* Condition reference number */
1897 : &subreqchar, /* For possible last char */
1898 : &subcountlits, /* For literal count */
1899 : cd)) /* Tables block */
1900 0 : goto FAILED;
1901 :
1902 : /* At the end of compiling, code is still pointing to the start of the
1903 : group, while tempcode has been updated to point past the end of the group
1904 : and any option resetting that may follow it. The pattern pointer (ptr)
1905 : is on the bracket. */
1906 :
1907 : /* If this is a conditional bracket, check that there are no more than
1908 : two branches in the group. */
1909 :
1910 0 : if (bravalue == OP_COND)
1911 : {
1912 0 : uschar *tc = code;
1913 0 : condcount = 0;
1914 :
1915 : do {
1916 0 : condcount++;
1917 0 : tc += (tc[1] << 8) | tc[2];
1918 : }
1919 0 : while (*tc != OP_KET);
1920 :
1921 0 : if (condcount > 2)
1922 : {
1923 0 : *errorptr = ERR27;
1924 0 : goto FAILED;
1925 : }
1926 : }
1927 :
1928 : /* Handle updating of the required character. If the subpattern didn't
1929 : set one, leave it as it was. Otherwise, update it for normal brackets of
1930 : all kinds, forward assertions, and conditions with two branches. Don't
1931 : update the literal count for forward assertions, however. If the bracket
1932 : is followed by a quantifier with zero repeat, we have to back off. Hence
1933 : the definition of prevreqchar and subcountlits outside the main loop so
1934 : that they can be accessed for the back off. */
1935 :
1936 0 : if (subreqchar > 0 &&
1937 0 : (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1938 0 : (bravalue == OP_COND && condcount == 2)))
1939 : {
1940 0 : prevreqchar = *reqchar;
1941 0 : *reqchar = subreqchar;
1942 0 : if (bravalue != OP_ASSERT) *countlits += subcountlits;
1943 : }
1944 :
1945 : /* Now update the main code pointer to the end of the group. */
1946 :
1947 0 : code = tempcode;
1948 :
1949 : /* Error if hit end of pattern */
1950 :
1951 0 : if (*ptr != ')')
1952 : {
1953 0 : *errorptr = ERR14;
1954 0 : goto FAILED;
1955 : }
1956 0 : break;
1957 :
1958 : /* Check \ for being a real metacharacter; if not, fall through and handle
1959 : it as a data character at the start of a string. Escape items are checked
1960 : for validity in the pre-compiling pass. */
1961 :
1962 0 : case '\\':
1963 0 : tempptr = ptr;
1964 0 : c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1965 :
1966 : /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1967 : are arranged to be the negation of the corresponding OP_values. For the
1968 : back references, the values are ESC_REF plus the reference number. Only
1969 : back references and those types that consume a character may be repeated.
1970 : We can test for values between ESC_b and ESC_Z for the latter; this may
1971 : have to change if any new ones are ever created. */
1972 :
1973 0 : if (c < 0)
1974 : {
1975 0 : if (-c >= ESC_REF)
1976 : {
1977 0 : previous = code;
1978 0 : *code++ = OP_REF;
1979 0 : *code++ = -c - ESC_REF;
1980 : }
1981 : else
1982 : {
1983 0 : previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1984 0 : *code++ = -c;
1985 : }
1986 0 : continue;
1987 : }
1988 :
1989 : /* Data character: reset and fall through */
1990 :
1991 0 : ptr = tempptr;
1992 0 : c = '\\';
1993 :
1994 : /* Handle a run of data characters until a metacharacter is encountered.
1995 : The first character is guaranteed not to be whitespace or # when the
1996 : extended flag is set. */
1997 :
1998 0 : NORMAL_CHAR:
1999 : default:
2000 0 : previous = code;
2001 0 : *code = OP_CHARS;
2002 0 : code += 2;
2003 0 : length = 0;
2004 :
2005 : do
2006 : {
2007 0 : if ((options & PCRE_EXTENDED) != 0)
2008 : {
2009 0 : if ((cd->ctypes[c] & ctype_space) != 0) continue;
2010 0 : if (c == '#')
2011 : {
2012 : /* The space before the ; is to avoid a warning on a silly compiler
2013 : on the Macintosh. */
2014 0 : while ((c = *(++ptr)) != 0 && c != '\n') ;
2015 0 : if (c == 0) break;
2016 0 : continue;
2017 : }
2018 : }
2019 :
2020 : /* Backslash may introduce a data char or a metacharacter. Escaped items
2021 : are checked for validity in the pre-compiling pass. Stop the string
2022 : before a metaitem. */
2023 :
2024 0 : if (c == '\\')
2025 : {
2026 0 : tempptr = ptr;
2027 0 : c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2028 0 : if (c < 0) { ptr = tempptr; break; }
2029 :
2030 : /* If a character is > 127 in UTF-8 mode, we have to turn it into
2031 : two or more characters in the UTF-8 encoding. */
2032 :
2033 : #ifdef SUPPORT_UTF8
2034 : if (c > 127 && (options & PCRE_UTF8) != 0)
2035 : {
2036 : uschar buffer[8];
2037 : int len = ord2utf8(c, buffer);
2038 : for (c = 0; c < len; c++) *code++ = buffer[c];
2039 : length += len;
2040 : continue;
2041 : }
2042 : #endif
2043 : }
2044 :
2045 : /* Ordinary character or single-char escape */
2046 :
2047 0 : *code++ = c;
2048 0 : length++;
2049 : }
2050 :
2051 : /* This "while" is the end of the "do" above. */
2052 :
2053 0 : while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2054 :
2055 : /* Update the last character and the count of literals */
2056 :
2057 0 : prevreqchar = (length > 1)? code[-2] : *reqchar;
2058 0 : *reqchar = code[-1];
2059 0 : *countlits += length;
2060 :
2061 : /* Compute the length and set it in the data vector, and advance to
2062 : the next state. */
2063 :
2064 0 : previous[1] = length;
2065 0 : if (length < MAXLIT) ptr--; /* back up so the for-loop's ptr++ revisits the char that ended the run */
2066 0 : break;
2067 : }
2068 : } /* end of big loop */
2069 :
2070 : /* Control never reaches here by falling through, only by a goto for all the
2071 : error states. Pass back the position in the pattern so that it can be displayed
2072 : to the user for diagnosing the error. */
2073 :
2074 0 : FAILED:
2075 0 : *ptrptr = ptr;
2076 0 : return FALSE;
2077 : }
2078 :
2079 :
2080 :
2081 :
2082 : /*************************************************
2083 : * Compile sequence of alternatives *
2084 : *************************************************/
2085 :
2086 : /* On entry, ptr is pointing past the bracket character, but on return
2087 : it points to the closing bracket, or vertical bar, or end of string.
2088 : The code variable is pointing at the byte into which the BRA operator has been
2089 : stored. If the ims options are changed at the start (for a (?ims: group) or
2090 : during any branch, we need to insert an OP_OPT item at the start of every
2091 : following branch to ensure they get set correctly at run time, and also pass
2092 : the new options into every subsequent branch compile.
2093 :
2094 : Argument:
2095 : options the option bits
2096 : optchanged new ims options to set as if (?ims) were at the start, or -1
2097 : for no change
2098 : brackets -> int containing the number of extracting brackets used
2099 : codeptr -> the address of the current code pointer
2100 : ptrptr -> the address of the current pattern pointer
2101 : errorptr -> pointer to error message
2102 : lookbehind TRUE if this is a lookbehind assertion
2103 : condref >= 0 for OPT_CREF setting at start of conditional group
2104 : reqchar -> place to put the last required character, or a negative number
2105 : countlits -> place to put the shortest literal count of any branch
2106 : cd points to the data block with tables pointers
2107 :
2108 : Returns: TRUE on success
2109 : */
2110 :
2111 : static BOOL
2112 0 : compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2113 : const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2114 : int *reqchar, int *countlits, compile_data *cd)
2115 : {
2116 0 : const uschar *ptr = *ptrptr;
2117 0 : uschar *code = *codeptr;
2118 0 : uschar *last_branch = code;
2119 0 : uschar *start_bracket = code;
2120 0 : uschar *reverse_count = NULL;
2121 0 : int oldoptions = options & PCRE_IMS;
2122 : int branchreqchar, branchcountlits;
2123 :
2124 0 : *reqchar = -1;
2125 0 : *countlits = INT_MAX;
2126 0 : code += 3;
2127 :
2128 : /* At the start of a reference-based conditional group, insert the reference
2129 : number as an OP_CREF item. */
2130 :
2131 0 : if (condref >= 0)
2132 : {
2133 0 : *code++ = OP_CREF;
2134 0 : *code++ = condref;
2135 : }
2136 :
2137 : /* Loop for each alternative branch */
2138 :
2139 : for (;;)
2140 0 : {
2141 : int length;
2142 :
2143 : /* Handle change of options */
2144 :
2145 0 : if (optchanged >= 0)
2146 : {
2147 0 : *code++ = OP_OPT;
2148 0 : *code++ = optchanged;
2149 0 : options = (options & ~PCRE_IMS) | optchanged;
2150 : }
2151 :
2152 : /* Set up dummy OP_REVERSE if lookbehind assertion */
2153 :
2154 0 : if (lookbehind)
2155 : {
2156 0 : *code++ = OP_REVERSE;
2157 0 : reverse_count = code;
2158 0 : *code++ = 0;
2159 0 : *code++ = 0;
2160 : }
2161 :
2162 : /* Now compile the branch */
2163 :
2164 0 : if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2165 : &branchreqchar, &branchcountlits, cd))
2166 : {
2167 0 : *ptrptr = ptr;
2168 0 : return FALSE;
2169 : }
2170 :
2171 : /* Fill in the length of the last branch */
2172 :
2173 0 : length = code - last_branch;
2174 0 : last_branch[1] = length >> 8;
2175 0 : last_branch[2] = length & 255;
2176 :
2177 : /* Save the last required character if all branches have the same; a current
2178 : value of -1 means unset, while -2 means "previous branch had no last required
2179 : char". */
2180 :
2181 0 : if (*reqchar != -2)
2182 : {
2183 0 : if (branchreqchar >= 0)
2184 : {
2185 0 : if (*reqchar == -1) *reqchar = branchreqchar;
2186 0 : else if (*reqchar != branchreqchar) *reqchar = -2;
2187 : }
2188 0 : else *reqchar = -2;
2189 : }
2190 :
2191 : /* Keep the shortest literal count */
2192 :
2193 0 : if (branchcountlits < *countlits) *countlits = branchcountlits;
2194 : DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2195 :
2196 : /* If lookbehind, check that this branch matches a fixed-length string,
2197 : and put the length into the OP_REVERSE item. Temporarily mark the end of
2198 : the branch with OP_END. */
2199 :
2200 0 : if (lookbehind)
2201 : {
2202 0 : *code = OP_END;
2203 0 : length = find_fixedlength(last_branch, options);
2204 : DPRINTF(("fixed length = %d\n", length));
2205 0 : if (length < 0)
2206 : {
2207 0 : *errorptr = ERR25;
2208 0 : *ptrptr = ptr;
2209 0 : return FALSE;
2210 : }
2211 0 : reverse_count[0] = (length >> 8);
2212 0 : reverse_count[1] = length & 255;
2213 : }
2214 :
2215 : /* Reached end of expression, either ')' or end of pattern. Insert a
2216 : terminating ket and the length of the whole bracketed item, and return,
2217 : leaving the pointer at the terminating char. If any of the ims options
2218 : were changed inside the group, compile a resetting op-code following. */
2219 :
2220 0 : if (*ptr != '|')
2221 : {
2222 0 : length = code - start_bracket;
2223 0 : *code++ = OP_KET;
2224 0 : *code++ = length >> 8;
2225 0 : *code++ = length & 255;
2226 0 : if (optchanged >= 0)
2227 : {
2228 0 : *code++ = OP_OPT;
2229 0 : *code++ = oldoptions;
2230 : }
2231 0 : *codeptr = code;
2232 0 : *ptrptr = ptr;
2233 0 : return TRUE;
2234 : }
2235 :
2236 : /* Another branch follows; insert an "or" node and advance the pointer. */
2237 :
2238 0 : *code = OP_ALT;
2239 0 : last_branch = code;
2240 0 : code += 3;
2241 0 : ptr++;
2242 : }
2243 : /* Control never reaches here */
2244 : }
2245 :
2246 :
2247 :
2248 :
2249 : /*************************************************
2250 : * Find first significant op code *
2251 : *************************************************/
2252 :
2253 : /* This is called by several functions that scan a compiled expression looking
2254 : for a fixed first character, or an anchoring op code etc. It skips over things
2255 : that do not influence this. For one application, a change of caseless option is
2256 : important.
2257 :
2258 : Arguments:
2259 : code pointer to the start of the group
2260 : options pointer to external options
2261 : optbit the option bit whose changing is significant, or
2262 : zero if none are
2263 : optstop TRUE to return on option change, otherwise change the options
2264 : value and continue
2265 :
2266 : Returns: pointer to the first significant opcode
2267 : */
2268 :
2269 : static const uschar*
2270 0 : first_significant_code(const uschar *code, int *options, int optbit,
2271 : BOOL optstop)
2272 : {
2273 : for (;;)
2274 : {
2275 0 : switch ((int)*code)
2276 : {
2277 0 : case OP_OPT:
2278 0 : if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2279 : {
2280 0 : if (optstop) return code;
2281 0 : *options = (int)code[1];
2282 : }
2283 0 : code += 2;
2284 0 : break;
2285 :
2286 0 : case OP_CREF:
2287 0 : code += 2;
2288 0 : break;
2289 :
2290 0 : case OP_WORD_BOUNDARY:
2291 : case OP_NOT_WORD_BOUNDARY:
2292 0 : code++;
2293 0 : break;
2294 :
2295 0 : case OP_ASSERT_NOT:
2296 : case OP_ASSERTBACK:
2297 : case OP_ASSERTBACK_NOT:
2298 0 : do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2299 0 : code += 3;
2300 0 : break;
2301 :
2302 0 : default:
2303 0 : return code;
2304 : }
2305 : }
2306 : /* Control never reaches here */
2307 : }
2308 :
2309 :
2310 :
2311 :
2312 : /*************************************************
2313 : * Check for anchored expression *
2314 : *************************************************/
2315 :
2316 : /* Try to find out if this is an anchored regular expression. Consider each
2317 : alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2318 : all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2319 : it's anchored. However, if this is a multiline pattern, then only OP_SOD
2320 : counts, since OP_CIRC can match in the middle.
2321 :
2322 : A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2323 : because that will try the rest of the pattern at all possible matching points,
2324 : so there is no point trying them again.
2325 :
2326 : Arguments:
2327 : code points to start of expression (the bracket)
2328 : options points to the options setting
2329 :
2330 : Returns: TRUE or FALSE
2331 : */
2332 :
2333 : static BOOL
2334 0 : is_anchored(register const uschar *code, int *options)
2335 : {
2336 : do {
2337 0 : const uschar *scode = first_significant_code(code + 3, options,
2338 : PCRE_MULTILINE, FALSE);
2339 0 : register int op = *scode;
2340 0 : if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2341 0 : { if (!is_anchored(scode, options)) return FALSE; }
2342 0 : else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2343 0 : (*options & PCRE_DOTALL) != 0)
2344 0 : { if (scode[1] != OP_ANY) return FALSE; }
2345 0 : else if (op != OP_SOD &&
2346 0 : ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2347 0 : return FALSE;
2348 0 : code += (code[1] << 8) + code[2];
2349 : }
2350 0 : while (*code == OP_ALT);
2351 0 : return TRUE;
2352 : }
2353 :
2354 :
2355 :
2356 : /*************************************************
2357 : * Check for starting with ^ or .* *
2358 : *************************************************/
2359 :
2360 : /* This is called to find out if every branch starts with ^ or .* so that
2361 : "first char" processing can be done to speed things up in multiline
2362 : matching and for non-DOTALL patterns that start with .* (which must start at
2363 : the beginning or after \n).
2364 :
2365 : Argument: points to start of expression (the bracket)
2366 : Returns: TRUE or FALSE
2367 : */
2368 :
2369 : static BOOL
2370 0 : is_startline(const uschar *code)
2371 : {
2372 : do {
2373 0 : const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2374 0 : register int op = *scode;
2375 0 : if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2376 0 : { if (!is_startline(scode)) return FALSE; }
2377 0 : else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2378 0 : { if (scode[1] != OP_ANY) return FALSE; }
2379 0 : else if (op != OP_CIRC) return FALSE;
2380 0 : code += (code[1] << 8) + code[2];
2381 : }
2382 0 : while (*code == OP_ALT);
2383 0 : return TRUE;
2384 : }
2385 :
2386 :
2387 :
2388 : /*************************************************
2389 : * Check for fixed first char *
2390 : *************************************************/
2391 :
2392 : /* Try to find out if there is a fixed first character. This is called for
2393 : unanchored expressions, as it speeds up their processing quite considerably.
2394 : Consider each alternative branch. If they all start with the same char, or with
2395 : a bracket all of whose alternatives start with the same char (recurse ad lib),
2396 : then we return that char, otherwise -1.
2397 :
2398 : Arguments:
2399 : code points to start of expression (the bracket)
2400 : options pointer to the options (used to check casing changes)
2401 :
2402 : Returns: -1 or the fixed first char
2403 : */
2404 :
2405 : static int
2406 0 : find_firstchar(const uschar *code, int *options)
2407 : {
2408 0 : register int c = -1;
2409 : do {
2410 : int d;
2411 0 : const uschar *scode = first_significant_code(code + 3, options,
2412 : PCRE_CASELESS, TRUE);
2413 0 : register int op = *scode;
2414 :
2415 0 : if (op >= OP_BRA) op = OP_BRA;
2416 :
2417 0 : switch(op)
2418 : {
2419 0 : default:
2420 0 : return -1;
2421 :
2422 0 : case OP_BRA:
2423 : case OP_ASSERT:
2424 : case OP_ONCE:
2425 : case OP_COND:
2426 0 : if ((d = find_firstchar(scode, options)) < 0) return -1;
2427 0 : if (c < 0) c = d; else if (c != d) return -1;
2428 0 : break;
2429 :
2430 0 : case OP_EXACT: /* Fall through */
2431 0 : scode++;
2432 :
2433 0 : case OP_CHARS: /* Fall through */
2434 0 : scode++;
2435 :
2436 0 : case OP_PLUS:
2437 : case OP_MINPLUS:
2438 0 : if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2439 0 : break;
2440 : }
2441 :
2442 0 : code += (code[1] << 8) + code[2];
2443 : }
2444 0 : while (*code == OP_ALT);
2445 0 : return c;
2446 : }
2447 :
2448 :
2449 :
2450 :
2451 :
2452 : /*************************************************
2453 : * Compile a Regular Expression *
2454 : *************************************************/
2455 :
2456 : /* This function takes a string and returns a pointer to a block of store
2457 : holding a compiled version of the expression.
2458 :
2459 : Arguments:
2460 : pattern the regular expression
2461 : options various option bits
2462 : errorptr pointer to pointer to error text
2463 : erroroffset ptr offset in pattern where error was detected
2464 : tables pointer to character tables or NULL
2465 :
2466 : Returns: pointer to compiled data block, or NULL on error,
2467 : with errorptr and erroroffset set
2468 : */
2469 :
2470 : pcre *
2471 0 : pcre_compile(const char *pattern, int options, const char **errorptr,
2472 : int *erroroffset, const unsigned char *tables)
2473 : {
2474 : real_pcre *re;
2475 0 : int length = 3; /* For initial BRA plus length */
2476 : int runlength;
2477 : int c, reqchar, countlits;
2478 0 : int bracount = 0;
2479 0 : int top_backref = 0;
2480 0 : int branch_extra = 0;
2481 : int branch_newextra;
2482 0 : unsigned int brastackptr = 0;
2483 : size_t size;
2484 : uschar *code;
2485 : const uschar *ptr;
2486 : compile_data compile_block;
2487 : int brastack[BRASTACK_SIZE];
2488 : uschar bralenstack[BRASTACK_SIZE];
2489 0 : const size_t pattern_length = strlen(pattern);
2490 :
2491 : #ifdef DEBUG
2492 : uschar *code_base, *code_end;
2493 : #endif
2494 :
2495 : /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2496 :
2497 : #ifndef SUPPORT_UTF8
2498 0 : if ((options & PCRE_UTF8) != 0)
2499 : {
2500 0 : *errorptr = ERR32;
2501 0 : return NULL;
2502 : }
2503 : #endif
2504 :
2505 : /* We can't pass back an error message if errorptr is NULL; I guess the best we
2506 : can do is just return NULL. */
2507 :
2508 0 : if (errorptr == NULL) return NULL;
2509 0 : *errorptr = NULL;
2510 :
2511 : /* However, we can give a message for this error */
2512 :
2513 0 : if (erroroffset == NULL)
2514 : {
2515 0 : *errorptr = ERR16;
2516 0 : return NULL;
2517 : }
2518 0 : *erroroffset = 0;
2519 :
2520 0 : if ((options & ~PUBLIC_OPTIONS) != 0)
2521 : {
2522 0 : *errorptr = ERR17;
2523 0 : return NULL;
2524 : }
2525 :
2526 : /* Set up pointers to the individual character tables */
2527 :
2528 0 : if (tables == NULL) tables = pcre_default_tables;
2529 0 : compile_block.lcc = tables + lcc_offset;
2530 0 : compile_block.fcc = tables + fcc_offset;
2531 0 : compile_block.cbits = tables + cbits_offset;
2532 0 : compile_block.ctypes = tables + ctypes_offset;
2533 :
2534 : /* Reflect pattern for debugging output */
2535 :
2536 : DPRINTF(("------------------------------------------------------------------\n"));
2537 : DPRINTF(("%s\n", pattern));
2538 :
2539 : /* The first thing to do is to make a pass over the pattern to compute the
2540 : amount of store required to hold the compiled code. This does not have to be
2541 : perfect as long as errors are overestimates. At the same time we can detect any
2542 : internal flag settings. Make an attempt to correct for any counted white space
2543 : if an "extended" flag setting appears late in the pattern. We can't be so
2544 : clever for #-comments. */
2545 :
2546 0 : ptr = (const uschar *)(pattern - 1);
2547 0 : while ((c = *(++ptr)) != 0)
2548 : {
2549 : int min, max;
2550 : int class_charcount;
2551 :
2552 0 : if ((options & PCRE_EXTENDED) != 0)
2553 : {
2554 0 : if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2555 0 : if (c == '#')
2556 : {
2557 : /* The space before the ; is to avoid a warning on a silly compiler
2558 : on the Macintosh. */
2559 0 : while ((c = *(++ptr)) != 0 && c != '\n') ;
2560 0 : continue;
2561 : }
2562 : }
2563 :
2564 0 : switch(c)
2565 : {
2566 : /* A backslashed item may be an escaped "normal" character or a
2567 : character type. For a "normal" character, put the pointers and
2568 : character back so that tests for whitespace etc. in the input
2569 : are done correctly. */
2570 :
2571 0 : case '\\':
2572 : {
2573 0 : const uschar *save_ptr = ptr;
2574 0 : c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2575 0 : if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2576 0 : if (c >= 0)
2577 : {
2578 0 : ptr = save_ptr;
2579 0 : c = '\\';
2580 0 : goto NORMAL_CHAR;
2581 : }
2582 : }
2583 0 : length++;
2584 :
2585 : /* A back reference needs an additional char, plus either one or 5
2586 : bytes for a repeat. We also need to keep the value of the highest
2587 : back reference. */
2588 :
2589 0 : if (c <= -ESC_REF)
2590 : {
2591 0 : int refnum = -c - ESC_REF;
2592 0 : if (refnum > top_backref) top_backref = refnum;
2593 0 : length++; /* For single back reference */
2594 0 : if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2595 : {
2596 0 : ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2597 0 : if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2598 0 : if ((min == 0 && (max == 1 || max == -1)) ||
2599 0 : (min == 1 && max == -1))
2600 0 : length++;
2601 0 : else length += 5;
2602 0 : if (ptr[1] == '?') ptr++;
2603 : }
2604 : }
2605 0 : continue;
2606 :
2607 0 : case '^':
2608 : case '.':
2609 : case '$':
2610 : case '*': /* These repeats won't be after brackets; */
2611 : case '+': /* those are handled separately */
2612 : case '?':
2613 0 : length++;
2614 0 : continue;
2615 :
2616 : /* This covers the cases of repeats after a single char, metachar, class,
2617 : or back reference. */
2618 :
2619 0 : case '{':
2620 0 : if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2621 0 : ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2622 0 : if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2623 0 : if ((min == 0 && (max == 1 || max == -1)) ||
2624 0 : (min == 1 && max == -1))
2625 0 : length++;
2626 : else
2627 : {
2628 0 : length--; /* Uncount the original char or metachar */
2629 0 : if (min == 1) length++; else if (min > 0) length += 4;
2630 0 : if (max > 0) length += 4; else length += 2;
2631 : }
2632 0 : if (ptr[1] == '?') ptr++;
2633 0 : continue;
2634 :
2635 : /* An alternation contains an offset to the next branch or ket. If any ims
2636 : options changed in the previous branch(es), and/or if we are in a
2637 : lookbehind assertion, extra space will be needed at the start of the
2638 : branch. This is handled by branch_extra. */
2639 :
2640 0 : case '|':
2641 0 : length += 3 + branch_extra;
2642 0 : continue;
2643 :
2644 : /* A character class uses 33 characters. Don't worry about character types
2645 : that aren't allowed in classes - they'll get picked up during the compile.
2646 : A character class that contains only one character uses 2 or 3 bytes,
2647 : depending on whether it is negated or not. Notice this where we can. */
2648 :
2649 0 : case '[':
2650 0 : class_charcount = 0;
2651 0 : if (*(++ptr) == '^') ptr++;
2652 : do
2653 : {
2654 0 : if (*ptr == '\\')
2655 : {
2656 0 : int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2657 : &compile_block);
2658 0 : if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2659 0 : if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2660 : }
2661 0 : else class_charcount++;
2662 0 : ptr++;
2663 0 : if (*ptr == 0)
2664 : {
2665 0 : *errorptr = ERR6;
2666 0 : goto PCRE_ERROR_RETURN;
2667 : }
2668 : }
2669 0 : while (*ptr != ']');
2670 :
2671 : /* Repeats for negated single chars are handled by the general code */
2672 :
2673 0 : if (class_charcount == 1) length += 3; else
2674 : {
2675 0 : length += 33;
2676 :
2677 : /* A repeat needs either 1 or 5 bytes. */
2678 :
2679 0 : if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2680 : {
2681 0 : ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2682 0 : if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2683 0 : if ((min == 0 && (max == 1 || max == -1)) ||
2684 0 : (min == 1 && max == -1))
2685 0 : length++;
2686 0 : else length += 5;
2687 0 : if (ptr[1] == '?') ptr++;
2688 : }
2689 : }
2690 0 : continue;
2691 :
2692 : /* Brackets may be genuine groups or special things */
2693 :
2694 0 : case '(':
2695 0 : branch_newextra = 0;
2696 :
2697 : /* Handle special forms of bracket, which all start (? */
2698 :
2699 0 : if (ptr[1] == '?')
2700 : {
2701 : int set, unset;
2702 : int *optset;
2703 :
2704 0 : switch (c = ptr[2])
2705 : {
2706 : /* Skip over comments entirely */
2707 0 : case '#':
2708 0 : ptr += 3;
2709 0 : while (*ptr != 0 && *ptr != ')') ptr++;
2710 0 : if (*ptr == 0)
2711 : {
2712 0 : *errorptr = ERR18;
2713 0 : goto PCRE_ERROR_RETURN;
2714 : }
2715 0 : continue;
2716 :
2717 : /* Non-referencing groups and lookaheads just move the pointer on, and
2718 : then behave like a non-special bracket, except that they don't increment
2719 : the count of extracting brackets. Ditto for the "once only" bracket,
2720 : which is in Perl from version 5.005. */
2721 :
2722 0 : case ':':
2723 : case '=':
2724 : case '!':
2725 : case '>':
2726 0 : ptr += 2;
2727 0 : break;
2728 :
2729 : /* A recursive call to the regex is an extension, to provide the
2730 : facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2731 :
2732 0 : case 'R':
2733 0 : if (ptr[3] != ')')
2734 : {
2735 0 : *errorptr = ERR29;
2736 0 : goto PCRE_ERROR_RETURN;
2737 : }
2738 0 : ptr += 3;
2739 0 : length += 1;
2740 0 : break;
2741 :
2742 : /* Lookbehinds are in Perl from version 5.005 */
2743 :
2744 0 : case '<':
2745 0 : if (ptr[3] == '=' || ptr[3] == '!')
2746 : {
2747 0 : ptr += 3;
2748 0 : branch_newextra = 3;
2749 0 : length += 3; /* For the first branch */
2750 0 : break;
2751 : }
2752 0 : *errorptr = ERR24;
2753 0 : goto PCRE_ERROR_RETURN;
2754 :
2755 : /* Conditionals are in Perl from version 5.005. The bracket must either
2756 : be followed by a number (for bracket reference) or by an assertion
2757 : group. */
2758 :
2759 0 : case '(':
2760 0 : if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2761 : {
2762 0 : ptr += 4;
2763 0 : length += 2;
2764 0 : while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2765 0 : if (*ptr != ')')
2766 : {
2767 0 : *errorptr = ERR26;
2768 0 : goto PCRE_ERROR_RETURN;
2769 : }
2770 : }
2771 : else /* An assertion must follow */
2772 : {
2773 0 : ptr++; /* Can treat like ':' as far as spacing is concerned */
2774 0 : if (ptr[2] != '?' ||
2775 0 : (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2776 : {
2777 0 : ptr += 2; /* To get right offset in message */
2778 0 : *errorptr = ERR28;
2779 0 : goto PCRE_ERROR_RETURN;
2780 : }
2781 : }
2782 0 : break;
2783 :
2784 : /* Else loop checking valid options until ) is met. Anything else is an
2785 : error. If we are without any brackets, i.e. at top level, the settings
2786 : act as if specified in the options, so massage the options immediately.
2787 : This is for backward compatibility with Perl 5.004. */
2788 :
2789 0 : default:
2790 0 : set = unset = 0;
2791 0 : optset = &set;
2792 0 : ptr += 2;
2793 :
2794 0 : for (;; ptr++)
2795 : {
2796 0 : c = *ptr;
2797 0 : switch (c)
2798 : {
2799 0 : case 'i':
2800 0 : *optset |= PCRE_CASELESS;
2801 0 : continue;
2802 :
2803 0 : case 'm':
2804 0 : *optset |= PCRE_MULTILINE;
2805 0 : continue;
2806 :
2807 0 : case 's':
2808 0 : *optset |= PCRE_DOTALL;
2809 0 : continue;
2810 :
2811 0 : case 'x':
2812 0 : *optset |= PCRE_EXTENDED;
2813 0 : continue;
2814 :
2815 0 : case 'X':
2816 0 : *optset |= PCRE_EXTRA;
2817 0 : continue;
2818 :
2819 0 : case 'U':
2820 0 : *optset |= PCRE_UNGREEDY;
2821 0 : continue;
2822 :
2823 0 : case '-':
2824 0 : optset = &unset;
2825 0 : continue;
2826 :
2827 : /* A termination by ')' indicates an options-setting-only item;
2828 : this is global at top level; otherwise nothing is done here and
2829 : it is handled during the compiling process on a per-bracket-group
2830 : basis. */
2831 :
2832 0 : case ')':
2833 0 : if (brastackptr == 0)
2834 : {
2835 0 : options = (options | set) & (~unset);
2836 0 : set = unset = 0; /* To save length */
2837 : }
2838 : /* Fall through */
2839 :
2840 : /* A termination by ':' indicates the start of a nested group with
2841 : the given options set. This is again handled at compile time, but
2842 : we must allow for compiled space if any of the ims options are
2843 : set. We also have to allow for resetting space at the end of
2844 : the group, which is why 4 is added to the length and not just 2.
2845 : If there are several changes of options within the same group, this
2846 : will lead to an over-estimate on the length, but this shouldn't
2847 : matter very much. We also have to allow for resetting options at
2848 : the start of any alternations, which we do by setting
2849 : branch_newextra to 2. Finally, we record whether the case-dependent
2850 : flag ever changes within the regex. This is used by the "required
2851 : character" code. */
2852 :
2853 : case ':':
2854 0 : if (((set|unset) & PCRE_IMS) != 0)
2855 : {
2856 0 : length += 4;
2857 0 : branch_newextra = 2;
2858 0 : if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2859 : }
2860 0 : goto END_OPTIONS;
2861 :
2862 : /* Unrecognized option character */
2863 :
2864 0 : default:
2865 0 : *errorptr = ERR12;
2866 0 : goto PCRE_ERROR_RETURN;
2867 : }
2868 : }
2869 :
2870 : /* If we hit a closing bracket, that's it - this is a freestanding
2871 : option-setting. We need to ensure that branch_extra is updated if
2872 : necessary. The only values branch_newextra can have here are 0 or 2.
2873 : If the value is 2, then branch_extra must either be 2 or 5, depending
2874 : on whether this is a lookbehind group or not. */
2875 :
2876 0 : END_OPTIONS:
2877 0 : if (c == ')')
2878 : {
2879 0 : if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2880 0 : branch_extra += branch_newextra;
2881 0 : continue;
2882 : }
2883 :
2884 : /* If options were terminated by ':' control comes here. Fall through
2885 : to handle the group below. */
2886 : }
2887 : }
2888 :
2889 : /* Extracting brackets must be counted so we can process escapes in a
2890 : Perlish way. */
2891 :
2892 0 : else bracount++;
2893 :
2894 : /* Non-special forms of bracket. Save length for computing whole length
2895 : at end if there's a repeat that requires duplication of the group. Also
2896 : save the current value of branch_extra, and start the new group with
2897 : the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2898 : for a lookbehind assertion. */
2899 :
2900 0 : if (brastackptr >= sizeof(brastack)/sizeof(int))
2901 : {
2902 0 : *errorptr = ERR19;
2903 0 : goto PCRE_ERROR_RETURN;
2904 : }
2905 :
2906 0 : bralenstack[brastackptr] = branch_extra;
2907 0 : branch_extra = branch_newextra;
2908 :
2909 0 : brastack[brastackptr++] = length;
2910 0 : length += 3;
2911 0 : continue;
2912 :
2913 : /* Handle ket. Look for subsequent max/min; for certain sets of values we
2914 : have to replicate this bracket up to that many times. If brastackptr is
2915 : 0 this is an unmatched bracket which will generate an error, but take care
2916 : not to try to access brastack[-1] when computing the length and restoring
2917 : the branch_extra value. */
2918 :
2919 0 : case ')':
2920 0 : length += 3;
2921 : {
2922 0 : int minval = 1;
2923 0 : int maxval = 1;
2924 : int duplength;
2925 :
2926 0 : if (brastackptr > 0)
2927 : {
2928 0 : duplength = length - brastack[--brastackptr];
2929 0 : branch_extra = bralenstack[brastackptr];
2930 : }
2931 0 : else duplength = 0;
2932 :
2933 : /* Leave ptr at the final char; for read_repeat_counts this happens
2934 : automatically; for the others we need an increment. */
2935 :
2936 0 : if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2937 : {
2938 0 : ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2939 : &compile_block);
2940 0 : if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2941 : }
2942 0 : else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2943 0 : else if (c == '+') { maxval = -1; ptr++; }
2944 0 : else if (c == '?') { minval = 0; ptr++; }
2945 :
2946 : /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2947 : group, and if the maximum is greater than zero, we have to replicate
2948 : maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2949 : bracket set - hence the 7. */
2950 :
2951 0 : if (minval == 0)
2952 : {
2953 0 : length++;
2954 0 : if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2955 : }
2956 :
2957 : /* When the minimum is greater than zero, we have to replicate up to
2958 : minval-1 times, with no additions required in the copies. Then, if
2959 : there is a limited maximum we have to replicate up to maxval-1 times
2960 : allowing for a BRAZERO item before each optional copy and nesting
2961 : brackets for all but one of the optional copies. */
2962 :
2963 : else
2964 : {
2965 0 : length += (minval - 1) * duplength;
2966 0 : if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2967 0 : length += (maxval - minval) * (duplength + 7) - 6;
2968 : }
2969 : }
2970 0 : continue;
2971 :
2972 : /* Non-special character. For a run of such characters the length required
2973 : is the number of characters + 2, except that the maximum run length is 255.
2974 : We won't get a skipped space or a non-data escape or the start of a #
2975 : comment as the first character, so the length can't be zero. */
2976 :
2977 0 : NORMAL_CHAR:
2978 : default:
2979 0 : length += 2;
2980 0 : runlength = 0;
2981 : do
2982 : {
2983 0 : if ((options & PCRE_EXTENDED) != 0)
2984 : {
2985 0 : if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2986 0 : if (c == '#')
2987 : {
2988 : /* The space before the ; is to avoid a warning on a silly compiler
2989 : on the Macintosh. */
2990 0 : while ((c = *(++ptr)) != 0 && c != '\n') ;
2991 0 : continue;
2992 : }
2993 : }
2994 :
2995 : /* Backslash may introduce a data char or a metacharacter; stop the
2996 : string before the latter. */
2997 :
2998 0 : if (c == '\\')
2999 : {
3000 0 : const uschar *saveptr = ptr;
3001 0 : c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3002 : &compile_block);
3003 0 : if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3004 0 : if (c < 0) { ptr = saveptr; break; }
3005 :
3006 : #ifdef SUPPORT_UTF8
3007 : if (c > 127 && (options & PCRE_UTF8) != 0)
3008 : {
3009 : int i;
3010 : for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3011 : if (c <= utf8_table1[i]) break;
3012 : runlength += i;
3013 : }
3014 : #endif
3015 : }
3016 :
3017 : /* Ordinary character or single-char escape */
3018 :
3019 0 : runlength++;
3020 :
3021 0 : if ((const char *)ptr > pattern + pattern_length)
3022 : {
3023 0 : *errorptr = "internal error";
3024 0 : goto PCRE_ERROR_RETURN;
3025 : }
3026 : }
3027 :
3028 : /* This "while" is the end of the "do" above. */
3029 :
3030 0 : while (runlength < MAXLIT &&
3031 0 : (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3032 :
3033 0 : ptr--;
3034 0 : length += runlength;
3035 0 : continue;
3036 : }
3037 : }
3038 :
3039 0 : length += 4; /* For final KET and END */
3040 :
3041 0 : if (length > 65539)
3042 : {
3043 0 : *errorptr = ERR20;
3044 0 : return NULL;
3045 : }
3046 :
3047 : /* Compute the size of data block needed and get it, either from malloc or
3048 : externally provided function. We specify "code[0]" in the offsetof() expression
3049 : rather than just "code", because it has been reported that one broken compiler
3050 : fails on "code" because it is also an independent variable. It should make no
3051 : difference to the value of the offsetof(). */
3052 :
3053 0 : size = length + offsetof(real_pcre, code[0]);
3054 0 : re = (real_pcre *)(pcre_malloc)(size);
3055 :
3056 0 : if (re == NULL)
3057 : {
3058 0 : *errorptr = ERR21;
3059 0 : return NULL;
3060 : }
3061 :
3062 : /* Put in the magic number, and save the size, options, and table pointer */
3063 :
3064 0 : re->magic_number = MAGIC_NUMBER;
3065 0 : re->size = size;
3066 0 : re->options = options;
3067 0 : re->tables = tables;
3068 :
3069 : /* Set up a starting, non-extracting bracket, then compile the expression. On
3070 : error, *errorptr will be set non-NULL, so we don't need to look at the result
3071 : of the function here. */
3072 :
3073 0 : ptr = (const uschar *)pattern;
3074 0 : code = re->code;
3075 0 : *code = OP_BRA;
3076 0 : bracount = 0;
3077 0 : (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3078 : &reqchar, &countlits, &compile_block);
3079 0 : re->top_bracket = bracount;
3080 0 : re->top_backref = top_backref;
3081 :
3082 : /* If not reached end of pattern on success, there's an excess bracket. */
3083 :
3084 0 : if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3085 :
3086 : /* Fill in the terminating state and check for disastrous overflow, but
3087 : if debugging, leave the test till after things are printed out. */
3088 :
3089 0 : *code++ = OP_END;
3090 :
3091 : #ifndef DEBUG
3092 0 : if (code - re->code > length) *errorptr = ERR23;
3093 : #endif
3094 :
3095 : /* Give an error if there's back reference to a non-existent capturing
3096 : subpattern. */
3097 :
3098 0 : if (top_backref > re->top_bracket) *errorptr = ERR15;
3099 :
3100 : /* Failed to compile */
3101 :
3102 0 : if (*errorptr != NULL)
3103 : {
3104 0 : (pcre_free)(re);
3105 0 : PCRE_ERROR_RETURN:
3106 0 : *erroroffset = ptr - (const uschar *)pattern;
3107 0 : return NULL;
3108 : }
3109 :
3110 : /* If the anchored option was not passed, set flag if we can determine that the
3111 : pattern is anchored by virtue of ^ characters or \A or anything else (such as
3112 : starting with .* when DOTALL is set).
3113 :
3114 : Otherwise, see if we can determine what the first character has to be, because
3115 : that speeds up unanchored matches no end. If not, see if we can set the
3116 : PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3117 : start with ^, and also when all branches start with .* for non-DOTALL matches.
3118 : */
3119 :
3120 0 : if ((options & PCRE_ANCHORED) == 0)
3121 : {
3122 0 : int temp_options = options;
3123 0 : if (is_anchored(re->code, &temp_options))
3124 0 : re->options |= PCRE_ANCHORED;
3125 : else
3126 : {
3127 0 : int ch = find_firstchar(re->code, &temp_options);
3128 0 : if (ch >= 0)
3129 : {
3130 0 : re->first_char = ch;
3131 0 : re->options |= PCRE_FIRSTSET;
3132 : }
3133 0 : else if (is_startline(re->code))
3134 0 : re->options |= PCRE_STARTLINE;
3135 : }
3136 : }
3137 :
3138 : /* Save the last required character if there are at least two literal
3139 : characters on all paths, or if there is no first character setting. */
3140 :
3141 0 : if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3142 : {
3143 0 : re->req_char = reqchar;
3144 0 : re->options |= PCRE_REQCHSET;
3145 : }
3146 :
3147 : /* Print out the compiled data for debugging */
3148 :
3149 : #ifdef DEBUG
3150 :
3151 : printf("Length = %d top_bracket = %d top_backref = %d\n",
3152 : length, re->top_bracket, re->top_backref);
3153 :
3154 : if (re->options != 0)
3155 : {
3156 : printf("%s%s%s%s%s%s%s%s%s\n",
3157 : ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3158 : ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3159 : ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3160 : ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3161 : ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3162 : ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3163 : ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3164 : ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3165 : ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3166 : }
3167 :
3168 : if ((re->options & PCRE_FIRSTSET) != 0)
3169 : {
3170 : if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3171 : else printf("First char = \\x%02x\n", re->first_char);
3172 : }
3173 :
3174 : if ((re->options & PCRE_REQCHSET) != 0)
3175 : {
3176 : if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3177 : else printf("Req char = \\x%02x\n", re->req_char);
3178 : }
3179 :
3180 : code_end = code;
3181 : code_base = code = re->code;
3182 :
3183 : while (code < code_end)
3184 : {
3185 : int charlength;
3186 :
3187 : printf("%3d ", code - code_base);
3188 :
3189 : if (*code >= OP_BRA)
3190 : {
3191 : printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3192 : code += 2;
3193 : }
3194 :
3195 : else switch(*code)
3196 : {
3197 : case OP_OPT:
3198 : printf(" %.2x %s", code[1], OP_names[*code]);
3199 : code++;
3200 : break;
3201 :
3202 : case OP_COND:
3203 : printf("%3d Cond", (code[1] << 8) + code[2]);
3204 : code += 2;
3205 : break;
3206 :
3207 : case OP_CREF:
3208 : printf(" %.2d %s", code[1], OP_names[*code]);
3209 : code++;
3210 : break;
3211 :
3212 : case OP_CHARS:
3213 : charlength = *(++code);
3214 : printf("%3d ", charlength);
3215 : while (charlength-- > 0)
3216 : if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3217 : break;
3218 :
3219 : case OP_KETRMAX:
3220 : case OP_KETRMIN:
3221 : case OP_ALT:
3222 : case OP_KET:
3223 : case OP_ASSERT:
3224 : case OP_ASSERT_NOT:
3225 : case OP_ASSERTBACK:
3226 : case OP_ASSERTBACK_NOT:
3227 : case OP_ONCE:
3228 : printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3229 : code += 2;
3230 : break;
3231 :
3232 : case OP_REVERSE:
3233 : printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3234 : code += 2;
3235 : break;
3236 :
3237 : case OP_STAR:
3238 : case OP_MINSTAR:
3239 : case OP_PLUS:
3240 : case OP_MINPLUS:
3241 : case OP_QUERY:
3242 : case OP_MINQUERY:
3243 : case OP_TYPESTAR:
3244 : case OP_TYPEMINSTAR:
3245 : case OP_TYPEPLUS:
3246 : case OP_TYPEMINPLUS:
3247 : case OP_TYPEQUERY:
3248 : case OP_TYPEMINQUERY:
3249 : if (*code >= OP_TYPESTAR)
3250 : printf(" %s", OP_names[code[1]]);
3251 : else if (isprint(c = code[1])) printf(" %c", c);
3252 : else printf(" \\x%02x", c);
3253 : printf("%s", OP_names[*code++]);
3254 : break;
3255 :
3256 : case OP_EXACT:
3257 : case OP_UPTO:
3258 : case OP_MINUPTO:
3259 : if (isprint(c = code[3])) printf(" %c{", c);
3260 : else printf(" \\x%02x{", c);
3261 : if (*code != OP_EXACT) printf("0,");
3262 : printf("%d}", (code[1] << 8) + code[2]);
3263 : if (*code == OP_MINUPTO) printf("?");
3264 : code += 3;
3265 : break;
3266 :
3267 : case OP_TYPEEXACT:
3268 : case OP_TYPEUPTO:
3269 : case OP_TYPEMINUPTO:
3270 : printf(" %s{", OP_names[code[3]]);
3271 : if (*code != OP_TYPEEXACT) printf(",");
3272 : printf("%d}", (code[1] << 8) + code[2]);
3273 : if (*code == OP_TYPEMINUPTO) printf("?");
3274 : code += 3;
3275 : break;
3276 :
3277 : case OP_NOT:
3278 : if (isprint(c = *(++code))) printf(" [^%c]", c);
3279 : else printf(" [^\\x%02x]", c);
3280 : break;
3281 :
3282 : case OP_NOTSTAR:
3283 : case OP_NOTMINSTAR:
3284 : case OP_NOTPLUS:
3285 : case OP_NOTMINPLUS:
3286 : case OP_NOTQUERY:
3287 : case OP_NOTMINQUERY:
3288 : if (isprint(c = code[1])) printf(" [^%c]", c);
3289 : else printf(" [^\\x%02x]", c);
3290 : printf("%s", OP_names[*code++]);
3291 : break;
3292 :
3293 : case OP_NOTEXACT:
3294 : case OP_NOTUPTO:
3295 : case OP_NOTMINUPTO:
3296 : if (isprint(c = code[3])) printf(" [^%c]{", c);
3297 : else printf(" [^\\x%02x]{", c);
3298 : if (*code != OP_NOTEXACT) printf(",");
3299 : printf("%d}", (code[1] << 8) + code[2]);
3300 : if (*code == OP_NOTMINUPTO) printf("?");
3301 : code += 3;
3302 : break;
3303 :
3304 : case OP_REF:
3305 : printf(" \\%d", *(++code));
3306 : code ++;
3307 : goto CLASS_REF_REPEAT;
3308 :
3309 : case OP_CLASS:
3310 : {
3311 : int i, min, max;
3312 : code++;
3313 : printf(" [");
3314 :
3315 : for (i = 0; i < 256; i++)
3316 : {
3317 : if ((code[i/8] & (1 << (i&7))) != 0)
3318 : {
3319 : int j;
3320 : for (j = i+1; j < 256; j++)
3321 : if ((code[j/8] & (1 << (j&7))) == 0) break;
3322 : if (i == '-' || i == ']') printf("\\");
3323 : if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3324 : if (--j > i)
3325 : {
3326 : printf("-");
3327 : if (j == '-' || j == ']') printf("\\");
3328 : if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3329 : }
3330 : i = j;
3331 : }
3332 : }
3333 : printf("]");
3334 : code += 32;
3335 :
3336 : CLASS_REF_REPEAT:
3337 :
3338 : switch(*code)
3339 : {
3340 : case OP_CRSTAR:
3341 : case OP_CRMINSTAR:
3342 : case OP_CRPLUS:
3343 : case OP_CRMINPLUS:
3344 : case OP_CRQUERY:
3345 : case OP_CRMINQUERY:
3346 : printf("%s", OP_names[*code]);
3347 : break;
3348 :
3349 : case OP_CRRANGE:
3350 : case OP_CRMINRANGE:
3351 : min = (code[1] << 8) + code[2];
3352 : max = (code[3] << 8) + code[4];
3353 : if (max == 0) printf("{%d,}", min);
3354 : else printf("{%d,%d}", min, max);
3355 : if (*code == OP_CRMINRANGE) printf("?");
3356 : code += 4;
3357 : break;
3358 :
3359 : default:
3360 : code--;
3361 : }
3362 : }
3363 : break;
3364 :
3365 : /* Anything else is just a one-node item */
3366 :
3367 : default:
3368 : printf(" %s", OP_names[*code]);
3369 : break;
3370 : }
3371 :
3372 : code++;
3373 : printf("\n");
3374 : }
3375 : printf("------------------------------------------------------------------\n");
3376 :
3377 : /* This check is done here in the debugging case so that the code that
3378 : was compiled can be seen. */
3379 :
3380 : if (code - re->code > length)
3381 : {
3382 : *errorptr = ERR23;
3383 : (pcre_free)(re);
3384 : *erroroffset = ptr - (uschar *)pattern;
3385 : return NULL;
3386 : }
3387 : #endif
3388 :
3389 0 : return (pcre *)re;
3390 : }
3391 :
3392 :
3393 :
3394 : /*************************************************
3395 : * Match a back-reference *
3396 : *************************************************/
3397 :
3398 : /* If a back reference hasn't been set, the length that is passed is greater
3399 : than the number of characters left in the string, so the match fails.
3400 :
3401 : Arguments:
3402 : offset index into the offset vector
3403 : eptr points into the subject
3404 : length length to be matched
3405 : md points to match data block
3406 : ims the ims flags
3407 :
3408 : Returns: TRUE if matched
3409 : */
3410 :
3411 : static BOOL
3412 0 : match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3413 : unsigned long int ims)
3414 : {
3415 0 : const uschar *p = md->start_subject + md->offset_vector[offset];
3416 :
3417 : #ifdef DEBUG
3418 : if (eptr >= md->end_subject)
3419 : printf("matching subject <null>");
3420 : else
3421 : {
3422 : printf("matching subject ");
3423 : pchars(eptr, length, TRUE, md);
3424 : }
3425 : printf(" against backref ");
3426 : pchars(p, length, FALSE, md);
3427 : printf("\n");
3428 : #endif
3429 :
3430 : /* Always fail if not enough characters left */
3431 :
3432 0 : if (length > md->end_subject - eptr) return FALSE;
3433 :
3434 : /* Separate the caselesss case for speed */
3435 :
3436 0 : if ((ims & PCRE_CASELESS) != 0)
3437 : {
3438 0 : while (length-- > 0)
3439 0 : if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3440 : }
3441 : else
3442 0 : { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3443 :
3444 0 : return TRUE;
3445 : }
3446 :
3447 :
3448 :
3449 : /*************************************************
3450 : * Match from current position *
3451 : *************************************************/
3452 :
3453 : /* On entry ecode points to the first opcode, and eptr to the first character
3454 : in the subject string, while eptrb holds the value of eptr at the start of the
3455 : last bracketed group - used for breaking infinite loops matching zero-length
3456 : strings.
3457 :
3458 : Arguments:
3459 : eptr pointer in subject
3460 : ecode position in code
3461 : offset_top current top pointer
3462 : md pointer to "static" info for the match
3463 : ims current /i, /m, and /s options
3464 : eptrb pointer to chain of blocks containing eptr at start of
3465 : brackets - for testing for empty matches
3466 : flags can contain
3467 : match_condassert - this is an assertion condition
3468 : match_isgroup - this is the start of a bracketed group
3469 :
3470 : Returns: TRUE if matched
3471 : */
3472 :
3473 : __attribute__((no_sanitize("memory"))) __attribute__((no_sanitize_memory)) static BOOL
3474 0 : match(register const uschar *eptr, register const uschar *ecode,
3475 : int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3476 : int flags)
3477 : {
3478 0 : unsigned long int original_ims = ims; /* Save for resetting on ')' */
3479 : eptrblock newptrb;
3480 :
3481 : /* At the start of a bracketed group, add the current subject pointer to the
3482 : stack of such pointers, to be re-instated at the end of the group when we hit
3483 : the closing ket. When match() is called in other circumstances, we don't add to
3484 : the stack. */
3485 :
3486 0 : if ((flags & match_isgroup) != 0)
3487 : {
3488 0 : newptrb.prev = eptrb;
3489 0 : newptrb.saved_eptr = eptr;
3490 0 : eptrb = &newptrb;
3491 : }
3492 :
3493 : /* Now start processing the operations. */
3494 :
3495 : for (;;)
3496 0 : {
3497 0 : int op = (int)*ecode;
3498 : int min, max, ctype;
3499 : register int i;
3500 : register int c;
3501 0 : BOOL minimize = FALSE;
3502 :
3503 : /* Opening capturing bracket. If there is space in the offset vector, save
3504 : the current subject position in the working slot at the top of the vector. We
3505 : mustn't change the current values of the data slot, because they may be set
3506 : from a previous iteration of this group, and be referred to by a reference
3507 : inside the group.
3508 :
3509 : If the bracket fails to match, we need to restore this value and also the
3510 : values of the final offsets, in case they were set by a previous iteration of
3511 : the same bracket.
3512 :
3513 : If there isn't enough space in the offset vector, treat this as if it were a
3514 : non-capturing bracket. Don't worry about setting the flag for the error case
3515 : here; that is handled in the code for KET. */
3516 :
3517 0 : if (op > OP_BRA)
3518 : {
3519 0 : int number = op - OP_BRA;
3520 0 : int offset = number << 1;
3521 :
3522 : #ifdef DEBUG
3523 : printf("start bracket %d subject=", number);
3524 : pchars(eptr, 16, TRUE, md);
3525 : printf("\n");
3526 : #endif
3527 :
3528 0 : if (offset < md->offset_max)
3529 : {
3530 0 : int save_offset1 = md->offset_vector[offset];
3531 0 : int save_offset2 = md->offset_vector[offset+1];
3532 0 : int save_offset3 = md->offset_vector[md->offset_end - number];
3533 :
3534 : DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3535 0 : md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3536 :
3537 : do
3538 : {
3539 0 : if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3540 0 : return TRUE;
3541 0 : ecode += (ecode[1] << 8) + ecode[2];
3542 : }
3543 0 : while (*ecode == OP_ALT);
3544 :
3545 : DPRINTF(("bracket %d failed\n", number));
3546 :
3547 0 : md->offset_vector[offset] = save_offset1;
3548 0 : md->offset_vector[offset+1] = save_offset2;
3549 0 : md->offset_vector[md->offset_end - number] = save_offset3;
3550 0 : return FALSE;
3551 : }
3552 :
3553 : /* Insufficient room for saving captured contents */
3554 :
3555 0 : else op = OP_BRA;
3556 : }
3557 :
3558 : /* Other types of node can be handled by a switch */
3559 :
3560 0 : switch(op)
3561 : {
3562 0 : case OP_BRA: /* Non-capturing bracket: optimized */
3563 : DPRINTF(("start bracket 0\n"));
3564 : do
3565 : {
3566 0 : if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3567 0 : return TRUE;
3568 0 : ecode += (ecode[1] << 8) + ecode[2];
3569 : }
3570 0 : while (*ecode == OP_ALT);
3571 : DPRINTF(("bracket 0 failed\n"));
3572 0 : return FALSE;
3573 :
3574 : /* Conditional group: compilation checked that there are no more than
3575 : two branches. If the condition is false, skipping the first branch takes us
3576 : past the end if there is only one branch, but that's OK because that is
3577 : exactly what going to the ket would do. */
3578 :
3579 0 : case OP_COND:
3580 0 : if (ecode[3] == OP_CREF) /* Condition is extraction test */
3581 : {
3582 0 : int offset = ecode[4] << 1; /* Doubled reference number */
3583 0 : return match(eptr,
3584 0 : ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585 0 : 5 : 3 + (ecode[1] << 8) + ecode[2]),
3586 : offset_top, md, ims, eptrb, match_isgroup);
3587 : }
3588 :
3589 : /* The condition is an assertion. Call match() to evaluate it - including
3590 : match_condassert in the flags causes it to stop at the end of an assertion. */
3591 :
3592 : else
3593 : {
3594 0 : if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595 : match_condassert | match_isgroup))
3596 : {
3597 0 : ecode += 3 + (ecode[4] << 8) + ecode[5];
3598 0 : while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3599 : }
3600 0 : else ecode += (ecode[1] << 8) + ecode[2];
3601 0 : return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3602 : }
3603 : /* Control never reaches here */
3604 :
3605 : /* Skip over conditional reference data if encountered (should not be) */
3606 :
3607 0 : case OP_CREF:
3608 0 : ecode += 2;
3609 0 : break;
3610 :
3611 : /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3612 : an empty string - recursion will then try other alternatives, if any. */
3613 :
3614 0 : case OP_END:
3615 0 : if (md->notempty && eptr == md->start_match) return FALSE;
3616 0 : md->end_match_ptr = eptr; /* Record where we ended */
3617 0 : md->end_offset_top = offset_top; /* and how many extracts were taken */
3618 0 : return TRUE;
3619 :
3620 : /* Change option settings */
3621 :
3622 0 : case OP_OPT:
3623 0 : ims = ecode[1];
3624 0 : ecode += 2;
3625 : DPRINTF(("ims set to %02lx\n", ims));
3626 0 : break;
3627 :
3628 : /* Assertion brackets. Check the alternative branches in turn - the
3629 : matching won't pass the KET for an assertion. If any one branch matches,
3630 : the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3631 : start of each branch to move the current point backwards, so the code at
3632 : this level is identical to the lookahead case. */
3633 :
3634 0 : case OP_ASSERT:
3635 : case OP_ASSERTBACK:
3636 : do
3637 : {
3638 0 : if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3639 0 : ecode += (ecode[1] << 8) + ecode[2];
3640 : }
3641 0 : while (*ecode == OP_ALT);
3642 0 : if (*ecode == OP_KET) return FALSE;
3643 :
3644 : /* If checking an assertion for a condition, return TRUE. */
3645 :
3646 0 : if ((flags & match_condassert) != 0) return TRUE;
3647 :
3648 : /* Continue from after the assertion, updating the offsets high water
3649 : mark, since extracts may have been taken during the assertion. */
3650 :
3651 0 : do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3652 0 : ecode += 3;
3653 0 : offset_top = md->end_offset_top;
3654 0 : continue;
3655 :
3656 : /* Negative assertion: all branches must fail to match */
3657 :
3658 0 : case OP_ASSERT_NOT:
3659 : case OP_ASSERTBACK_NOT:
3660 : do
3661 : {
3662 0 : if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3663 0 : return FALSE;
3664 0 : ecode += (ecode[1] << 8) + ecode[2];
3665 : }
3666 0 : while (*ecode == OP_ALT);
3667 :
3668 0 : if ((flags & match_condassert) != 0) return TRUE;
3669 :
3670 0 : ecode += 3;
3671 0 : continue;
3672 :
3673 : /* Move the subject pointer back. This occurs only at the start of
3674 : each branch of a lookbehind assertion. If we are too close to the start to
3675 : move back, this match function fails. When working with UTF-8 we move
3676 : back a number of characters, not bytes. */
3677 :
3678 0 : case OP_REVERSE:
3679 : #ifdef SUPPORT_UTF8
3680 : c = (ecode[1] << 8) + ecode[2];
3681 : for (i = 0; i < c; i++)
3682 : {
3683 : eptr--;
3684 : BACKCHAR(eptr)
3685 : }
3686 : #else
3687 0 : eptr -= (ecode[1] << 8) + ecode[2];
3688 : #endif
3689 :
3690 0 : if (eptr < md->start_subject) return FALSE;
3691 0 : ecode += 3;
3692 0 : break;
3693 :
3694 : /* Recursion matches the current regex, nested. If there are any capturing
3695 : brackets started but not finished, we have to save their starting points
3696 : and reinstate them after the recursion. However, we don't know how many
3697 : such there are (offset_top records the completed total) so we just have
3698 : to save all the potential data. There may be up to 99 such values, which
3699 : is a bit large to put on the stack, but using malloc for small numbers
3700 : seems expensive. As a compromise, the stack is used when there are fewer
3701 : than 16 values to store; otherwise malloc is used. A problem is what to do
3702 : if the malloc fails ... there is no way of returning to the top level with
3703 : an error. Save the top 15 values on the stack, and accept that the rest
3704 : may be wrong. */
3705 :
3706 0 : case OP_RECURSE:
3707 : {
3708 : BOOL rc;
3709 : int *save;
3710 : int stacksave[15];
3711 :
3712 0 : c = md->offset_max;
3713 :
3714 0 : if (c < 16) save = stacksave; else
3715 : {
3716 0 : save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3717 0 : if (save == NULL)
3718 : {
3719 0 : save = stacksave;
3720 0 : c = 15;
3721 : }
3722 : }
3723 :
3724 0 : for (i = 1; i <= c; i++)
3725 0 : save[i] = md->offset_vector[md->offset_end - i];
3726 0 : rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3727 : match_isgroup);
3728 0 : for (i = 1; i <= c; i++)
3729 0 : md->offset_vector[md->offset_end - i] = save[i];
3730 0 : if (save != stacksave) (pcre_free)(save);
3731 0 : if (!rc) return FALSE;
3732 :
3733 : /* In case the recursion has set more capturing values, save the final
3734 : number, then move along the subject till after the recursive match,
3735 : and advance one byte in the pattern code. */
3736 :
3737 0 : offset_top = md->end_offset_top;
3738 0 : eptr = md->end_match_ptr;
3739 0 : ecode++;
3740 : }
3741 0 : break;
3742 :
3743 : /* "Once" brackets are like assertion brackets except that after a match,
3744 : the point in the subject string is not moved back. Thus there can never be
3745 : a move back into the brackets. Check the alternative branches in turn - the
3746 : matching won't pass the KET for this kind of subpattern. If any one branch
3747 : matches, we carry on as at the end of a normal bracket, leaving the subject
3748 : pointer. */
3749 :
3750 0 : case OP_ONCE:
3751 : {
3752 0 : const uschar *prev = ecode;
3753 0 : const uschar *saved_eptr = eptr;
3754 :
3755 : do
3756 : {
3757 0 : if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3758 0 : break;
3759 0 : ecode += (ecode[1] << 8) + ecode[2];
3760 : }
3761 0 : while (*ecode == OP_ALT);
3762 :
3763 : /* If hit the end of the group (which could be repeated), fail */
3764 :
3765 0 : if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3766 :
3767 : /* Continue as from after the assertion, updating the offsets high water
3768 : mark, since extracts may have been taken. */
3769 :
3770 0 : do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3771 :
3772 0 : offset_top = md->end_offset_top;
3773 0 : eptr = md->end_match_ptr;
3774 :
3775 : /* For a non-repeating ket, just continue at this level. This also
3776 : happens for a repeating ket if no characters were matched in the group.
3777 : This is the forcible breaking of infinite loops as implemented in Perl
3778 : 5.005. If there is an options reset, it will get obeyed in the normal
3779 : course of events. */
3780 :
3781 0 : if (*ecode == OP_KET || eptr == saved_eptr)
3782 : {
3783 0 : ecode += 3;
3784 0 : break;
3785 : }
3786 :
3787 : /* The repeating kets try the rest of the pattern or restart from the
3788 : preceding bracket, in the appropriate order. We need to reset any options
3789 : that changed within the bracket before re-running it, so check the next
3790 : opcode. */
3791 :
3792 0 : if (ecode[3] == OP_OPT)
3793 : {
3794 0 : ims = (ims & ~PCRE_IMS) | ecode[4];
3795 : DPRINTF(("ims set to %02lx at group repeat\n", ims));
3796 : }
3797 :
3798 0 : if (*ecode == OP_KETRMIN)
3799 : {
3800 0 : if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3801 0 : match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3802 0 : return TRUE;
3803 : }
3804 : else /* OP_KETRMAX */
3805 : {
3806 0 : if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3807 0 : match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3808 : }
3809 : }
3810 0 : return FALSE;
3811 :
3812 : /* An alternation is the end of a branch; scan along to find the end of the
3813 : bracketed group and go to there. */
3814 :
3815 0 : case OP_ALT:
3816 0 : do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3817 0 : break;
3818 :
3819 : /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3820 : that it may occur zero times. It may repeat infinitely, or not at all -
3821 : i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3822 : repeat limits are compiled as a number of copies, with the optional ones
3823 : preceded by BRAZERO or BRAMINZERO. */
3824 :
3825 0 : case OP_BRAZERO:
3826 : {
3827 0 : const uschar *next = ecode+1;
3828 0 : if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3829 0 : return TRUE;
3830 0 : do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3831 0 : ecode = next + 3;
3832 : }
3833 0 : break;
3834 :
3835 0 : case OP_BRAMINZERO:
3836 : {
3837 0 : const uschar *next = ecode+1;
3838 0 : do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3839 0 : if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3840 0 : return TRUE;
3841 0 : ecode++;
3842 : }
3843 0 : break;
3844 :
3845 : /* End of a group, repeated or non-repeating. If we are at the end of
3846 : an assertion "group", stop matching and return TRUE, but record the
3847 : current high water mark for use by positive assertions. Do this also
3848 : for the "once" (not backing up) groups. */
3849 :
3850 0 : case OP_KET:
3851 : case OP_KETRMIN:
3852 : case OP_KETRMAX:
3853 : {
3854 0 : const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3855 0 : const uschar *saved_eptr = eptrb->saved_eptr;
3856 :
3857 0 : eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3858 :
3859 0 : if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3860 0 : *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3861 0 : *prev == OP_ONCE)
3862 : {
3863 0 : md->end_match_ptr = eptr; /* For ONCE */
3864 0 : md->end_offset_top = offset_top;
3865 0 : return TRUE;
3866 : }
3867 :
3868 : /* In all other cases except a conditional group we have to check the
3869 : group number back at the start and if necessary complete handling an
3870 : extraction by setting the offsets and bumping the high water mark. */
3871 :
3872 0 : if (*prev != OP_COND)
3873 : {
3874 0 : int number = *prev - OP_BRA;
3875 0 : int offset = number << 1;
3876 :
3877 : #ifdef DEBUG
3878 : printf("end bracket %d", number);
3879 : printf("\n");
3880 : #endif
3881 :
3882 0 : if (number > 0)
3883 : {
3884 0 : if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3885 : {
3886 0 : md->offset_vector[offset] =
3887 0 : md->offset_vector[md->offset_end - number];
3888 0 : md->offset_vector[offset+1] = eptr - md->start_subject;
3889 0 : if (offset_top <= offset) offset_top = offset + 2;
3890 : }
3891 : }
3892 : }
3893 :
3894 : /* Reset the value of the ims flags, in case they got changed during
3895 : the group. */
3896 :
3897 0 : ims = original_ims;
3898 : DPRINTF(("ims reset to %02lx\n", ims));
3899 :
3900 : /* For a non-repeating ket, just continue at this level. This also
3901 : happens for a repeating ket if no characters were matched in the group.
3902 : This is the forcible breaking of infinite loops as implemented in Perl
3903 : 5.005. If there is an options reset, it will get obeyed in the normal
3904 : course of events. */
3905 :
3906 0 : if (*ecode == OP_KET || eptr == saved_eptr)
3907 : {
3908 0 : ecode += 3;
3909 0 : break;
3910 : }
3911 :
3912 : /* The repeating kets try the rest of the pattern or restart from the
3913 : preceding bracket, in the appropriate order. */
3914 :
3915 0 : if (*ecode == OP_KETRMIN)
3916 : {
3917 0 : if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3918 0 : match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3919 0 : return TRUE;
3920 : }
3921 : else /* OP_KETRMAX */
3922 : {
3923 0 : if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3924 0 : match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3925 : }
3926 : }
3927 0 : return FALSE;
3928 :
3929 : /* Start of subject unless notbol, or after internal newline if multiline */
3930 :
3931 0 : case OP_CIRC:
3932 0 : if (md->notbol && eptr == md->start_subject) return FALSE;
3933 0 : if ((ims & PCRE_MULTILINE) != 0)
3934 : {
3935 0 : if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3936 0 : ecode++;
3937 0 : break;
3938 : }
3939 : /* ... else fall through */
3940 :
3941 : /* Start of subject assertion */
3942 :
3943 : case OP_SOD:
3944 0 : if (eptr != md->start_subject) return FALSE;
3945 0 : ecode++;
3946 0 : break;
3947 :
3948 : /* Assert before internal newline if multiline, or before a terminating
3949 : newline unless endonly is set, else end of subject unless noteol is set. */
3950 :
3951 0 : case OP_DOLL:
3952 0 : if ((ims & PCRE_MULTILINE) != 0)
3953 : {
3954 0 : if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3955 0 : else { if (md->noteol) return FALSE; }
3956 0 : ecode++;
3957 0 : break;
3958 : }
3959 : else
3960 : {
3961 0 : if (md->noteol) return FALSE;
3962 0 : if (!md->endonly)
3963 : {
3964 0 : if (eptr < md->end_subject - 1 ||
3965 0 : (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3966 :
3967 0 : ecode++;
3968 0 : break;
3969 : }
3970 : }
3971 : /* ... else fall through */
3972 :
3973 : /* End of subject assertion (\z) */
3974 :
3975 : case OP_EOD:
3976 0 : if (eptr < md->end_subject) return FALSE;
3977 0 : ecode++;
3978 0 : break;
3979 :
3980 : /* End of subject or ending \n assertion (\Z) */
3981 :
3982 0 : case OP_EODN:
3983 0 : if (eptr < md->end_subject - 1 ||
3984 0 : (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3985 0 : ecode++;
3986 0 : break;
3987 :
3988 : /* Word boundary assertions */
3989 :
3990 0 : case OP_NOT_WORD_BOUNDARY:
3991 : case OP_WORD_BOUNDARY:
3992 : {
3993 0 : BOOL prev_is_word = (eptr != md->start_subject) &&
3994 0 : ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3995 0 : BOOL cur_is_word = (eptr < md->end_subject) &&
3996 0 : ((md->ctypes[*eptr] & ctype_word) != 0);
3997 0 : if ((*ecode++ == OP_WORD_BOUNDARY)?
3998 : cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3999 0 : return FALSE;
4000 : }
4001 0 : break;
4002 :
4003 : /* Match a single character type; inline for speed */
4004 :
4005 0 : case OP_ANY:
4006 0 : if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
4007 0 : return FALSE;
4008 0 : if (eptr++ >= md->end_subject) return FALSE;
4009 : #ifdef SUPPORT_UTF8
4010 : if (md->utf8)
4011 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4012 : #endif
4013 0 : ecode++;
4014 0 : break;
4015 :
4016 0 : case OP_NOT_DIGIT:
4017 0 : if (eptr >= md->end_subject ||
4018 0 : (md->ctypes[*eptr++] & ctype_digit) != 0)
4019 0 : return FALSE;
4020 0 : ecode++;
4021 0 : break;
4022 :
4023 0 : case OP_DIGIT:
4024 0 : if (eptr >= md->end_subject ||
4025 0 : (md->ctypes[*eptr++] & ctype_digit) == 0)
4026 0 : return FALSE;
4027 0 : ecode++;
4028 0 : break;
4029 :
4030 0 : case OP_NOT_WHITESPACE:
4031 0 : if (eptr >= md->end_subject ||
4032 0 : (md->ctypes[*eptr++] & ctype_space) != 0)
4033 0 : return FALSE;
4034 0 : ecode++;
4035 0 : break;
4036 :
4037 0 : case OP_WHITESPACE:
4038 0 : if (eptr >= md->end_subject ||
4039 0 : (md->ctypes[*eptr++] & ctype_space) == 0)
4040 0 : return FALSE;
4041 0 : ecode++;
4042 0 : break;
4043 :
4044 0 : case OP_NOT_WORDCHAR:
4045 0 : if (eptr >= md->end_subject ||
4046 0 : (md->ctypes[*eptr++] & ctype_word) != 0)
4047 0 : return FALSE;
4048 0 : ecode++;
4049 0 : break;
4050 :
4051 0 : case OP_WORDCHAR:
4052 0 : if (eptr >= md->end_subject ||
4053 0 : (md->ctypes[*eptr++] & ctype_word) == 0)
4054 0 : return FALSE;
4055 0 : ecode++;
4056 0 : break;
4057 :
4058 : /* Match a back reference, possibly repeatedly. Look past the end of the
4059 : item to see if there is repeat information following. The code is similar
4060 : to that for character classes, but repeated for efficiency. Then obey
4061 : similar code to character type repeats - written out again for speed.
4062 : However, if the referenced string is the empty string, always treat
4063 : it as matched, any number of times (otherwise there could be infinite
4064 : loops). */
4065 :
4066 0 : case OP_REF:
4067 : {
4068 : int length;
4069 0 : int offset = ecode[1] << 1; /* Doubled reference number */
4070 0 : ecode += 2; /* Advance past the item */
4071 :
4072 : /* If the reference is unset, set the length to be longer than the amount
4073 : of subject left; this ensures that every attempt at a match fails. We
4074 : can't just fail here, because of the possibility of quantifiers with zero
4075 : minima. */
4076 :
4077 0 : length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4078 0 : md->end_subject - eptr + 1 :
4079 0 : md->offset_vector[offset+1] - md->offset_vector[offset];
4080 :
4081 : /* Set up for repetition, or handle the non-repeated case */
4082 :
4083 0 : switch (*ecode)
4084 : {
4085 0 : case OP_CRSTAR:
4086 : case OP_CRMINSTAR:
4087 : case OP_CRPLUS:
4088 : case OP_CRMINPLUS:
4089 : case OP_CRQUERY:
4090 : case OP_CRMINQUERY:
4091 0 : c = *ecode++ - OP_CRSTAR;
4092 0 : minimize = (c & 1) != 0;
4093 0 : min = rep_min[c]; /* Pick up values from tables; */
4094 0 : max = rep_max[c]; /* zero for max => infinity */
4095 0 : if (max == 0) max = INT_MAX;
4096 0 : break;
4097 :
4098 0 : case OP_CRRANGE:
4099 : case OP_CRMINRANGE:
4100 0 : minimize = (*ecode == OP_CRMINRANGE);
4101 0 : min = (ecode[1] << 8) + ecode[2];
4102 0 : max = (ecode[3] << 8) + ecode[4];
4103 0 : if (max == 0) max = INT_MAX;
4104 0 : ecode += 5;
4105 0 : break;
4106 :
4107 0 : default: /* No repeat follows */
4108 0 : if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4109 0 : eptr += length;
4110 0 : continue; /* With the main loop */
4111 : }
4112 :
4113 : /* If the length of the reference is zero, just continue with the
4114 : main loop. */
4115 :
4116 0 : if (length == 0) continue;
4117 :
4118 : /* First, ensure the minimum number of matches are present. We get back
4119 : the length of the reference string explicitly rather than passing the
4120 : address of eptr, so that eptr can be a register variable. */
4121 :
4122 0 : for (i = 1; i <= min; i++)
4123 : {
4124 0 : if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4125 0 : eptr += length;
4126 : }
4127 :
4128 : /* If min = max, continue at the same level without recursion.
4129 : They are not both allowed to be zero. */
4130 :
4131 0 : if (min == max) continue;
4132 :
4133 : /* If minimizing, keep trying and advancing the pointer */
4134 :
4135 0 : if (minimize)
4136 : {
4137 0 : for (i = min;; i++)
4138 : {
4139 0 : if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4140 0 : return TRUE;
4141 0 : if (i >= max || !match_ref(offset, eptr, length, md, ims))
4142 0 : return FALSE;
4143 0 : eptr += length;
4144 : }
4145 : /* Control never gets here */
4146 : }
4147 :
4148 : /* If maximizing, find the longest string and work backwards */
4149 :
4150 : else
4151 : {
4152 0 : const uschar *pp = eptr;
4153 0 : for (i = min; i < max; i++)
4154 : {
4155 0 : if (!match_ref(offset, eptr, length, md, ims)) break;
4156 0 : eptr += length;
4157 : }
4158 0 : while (eptr >= pp)
4159 : {
4160 0 : if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4161 0 : return TRUE;
4162 0 : eptr -= length;
4163 : }
4164 0 : return FALSE;
4165 : }
4166 : }
4167 : /* Control never gets here */
4168 :
4169 :
4170 :
4171 : /* Match a character class, possibly repeatedly. Look past the end of the
4172 : item to see if there is repeat information following. Then obey similar
4173 : code to character type repeats - written out again for speed. */
4174 :
4175 0 : case OP_CLASS:
4176 : {
4177 0 : const uschar *data = ecode + 1; /* Save for matching */
4178 0 : ecode += 33; /* Advance past the item */
4179 :
4180 0 : switch (*ecode)
4181 : {
4182 0 : case OP_CRSTAR:
4183 : case OP_CRMINSTAR:
4184 : case OP_CRPLUS:
4185 : case OP_CRMINPLUS:
4186 : case OP_CRQUERY:
4187 : case OP_CRMINQUERY:
4188 0 : c = *ecode++ - OP_CRSTAR;
4189 0 : minimize = (c & 1) != 0;
4190 0 : min = rep_min[c]; /* Pick up values from tables; */
4191 0 : max = rep_max[c]; /* zero for max => infinity */
4192 0 : if (max == 0) max = INT_MAX;
4193 0 : break;
4194 :
4195 0 : case OP_CRRANGE:
4196 : case OP_CRMINRANGE:
4197 0 : minimize = (*ecode == OP_CRMINRANGE);
4198 0 : min = (ecode[1] << 8) + ecode[2];
4199 0 : max = (ecode[3] << 8) + ecode[4];
4200 0 : if (max == 0) max = INT_MAX;
4201 0 : ecode += 5;
4202 0 : break;
4203 :
4204 0 : default: /* No repeat follows */
4205 0 : min = max = 1;
4206 0 : break;
4207 : }
4208 :
4209 : /* First, ensure the minimum number of matches are present. */
4210 :
4211 0 : for (i = 1; i <= min; i++)
4212 : {
4213 0 : if (eptr >= md->end_subject) return FALSE;
4214 0 : GETCHARINC(c, eptr) /* Get character; increment eptr */
4215 :
4216 : #ifdef SUPPORT_UTF8
4217 : /* We do not yet support class members > 255 */
4218 : if (c > 255) return FALSE;
4219 : #endif
4220 :
4221 0 : if ((data[c/8] & (1 << (c&7))) != 0) continue;
4222 0 : return FALSE;
4223 : }
4224 :
4225 : /* If max == min we can continue with the main loop without the
4226 : need to recurse. */
4227 :
4228 0 : if (min == max) continue;
4229 :
4230 : /* If minimizing, keep testing the rest of the expression and advancing
4231 : the pointer while it matches the class. */
4232 :
4233 0 : if (minimize)
4234 : {
4235 0 : for (i = min;; i++)
4236 : {
4237 0 : if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4238 0 : return TRUE;
4239 0 : if (i >= max || eptr >= md->end_subject) return FALSE;
4240 0 : GETCHARINC(c, eptr) /* Get character; increment eptr */
4241 :
4242 : #ifdef SUPPORT_UTF8
4243 : /* We do not yet support class members > 255 */
4244 : if (c > 255) return FALSE;
4245 : #endif
4246 0 : if ((data[c/8] & (1 << (c&7))) != 0) continue;
4247 0 : return FALSE;
4248 : }
4249 : /* Control never gets here */
4250 : }
4251 :
4252 : /* If maximizing, find the longest possible run, then work backwards. */
4253 :
4254 : else
4255 : {
4256 0 : const uschar *pp = eptr;
4257 0 : int len = 1;
4258 0 : for (i = min; i < max; i++)
4259 : {
4260 0 : if (eptr >= md->end_subject) break;
4261 0 : GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
4262 :
4263 : #ifdef SUPPORT_UTF8
4264 : /* We do not yet support class members > 255 */
4265 : if (c > 255) break;
4266 : #endif
4267 0 : if ((data[c/8] & (1 << (c&7))) == 0) break;
4268 0 : eptr += len;
4269 : }
4270 :
4271 0 : while (eptr >= pp)
4272 : {
4273 0 : if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4274 0 : return TRUE;
4275 :
4276 : #ifdef SUPPORT_UTF8
4277 : BACKCHAR(eptr)
4278 : #endif
4279 : }
4280 0 : return FALSE;
4281 : }
4282 : }
4283 : /* Control never gets here */
4284 :
4285 : /* Match a run of characters */
4286 :
4287 0 : case OP_CHARS:
4288 : {
4289 0 : register int length = ecode[1];
4290 0 : ecode += 2;
4291 :
4292 : #ifdef DEBUG /* Sigh. Some compilers never learn. */
4293 : if (eptr >= md->end_subject)
4294 : printf("matching subject <null> against pattern ");
4295 : else
4296 : {
4297 : printf("matching subject ");
4298 : pchars(eptr, length, TRUE, md);
4299 : printf(" against pattern ");
4300 : }
4301 : pchars(ecode, length, FALSE, md);
4302 : printf("\n");
4303 : #endif
4304 :
4305 0 : if (length > md->end_subject - eptr) return FALSE;
4306 0 : if ((ims & PCRE_CASELESS) != 0)
4307 : {
4308 0 : while (length-- > 0)
4309 0 : if (md->lcc[*ecode++] != md->lcc[*eptr++])
4310 0 : return FALSE;
4311 : }
4312 : else
4313 : {
4314 0 : while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4315 : }
4316 : }
4317 0 : break;
4318 :
4319 : /* Match a single character repeatedly; different opcodes share code. */
4320 :
4321 0 : case OP_EXACT:
4322 0 : min = max = (ecode[1] << 8) + ecode[2];
4323 0 : ecode += 3;
4324 0 : goto REPEATCHAR;
4325 :
4326 0 : case OP_UPTO:
4327 : case OP_MINUPTO:
4328 0 : min = 0;
4329 0 : max = (ecode[1] << 8) + ecode[2];
4330 0 : minimize = *ecode == OP_MINUPTO;
4331 0 : ecode += 3;
4332 0 : goto REPEATCHAR;
4333 :
4334 0 : case OP_STAR:
4335 : case OP_MINSTAR:
4336 : case OP_PLUS:
4337 : case OP_MINPLUS:
4338 : case OP_QUERY:
4339 : case OP_MINQUERY:
4340 0 : c = *ecode++ - OP_STAR;
4341 0 : minimize = (c & 1) != 0;
4342 0 : min = rep_min[c]; /* Pick up values from tables; */
4343 0 : max = rep_max[c]; /* zero for max => infinity */
4344 0 : if (max == 0) max = INT_MAX;
4345 :
4346 : /* Common code for all repeated single-character matches. We can give
4347 : up quickly if there are fewer than the minimum number of characters left in
4348 : the subject. */
4349 :
4350 0 : REPEATCHAR:
4351 0 : if (min > md->end_subject - eptr) return FALSE;
4352 0 : c = *ecode++;
4353 :
4354 : /* The code is duplicated for the caseless and caseful cases, for speed,
4355 : since matching characters is likely to be quite common. First, ensure the
4356 : minimum number of matches are present. If min = max, continue at the same
4357 : level without recursing. Otherwise, if minimizing, keep trying the rest of
4358 : the expression and advancing one matching character if failing, up to the
4359 : maximum. Alternatively, if maximizing, find the maximum number of
4360 : characters and work backwards. */
4361 :
4362 : DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4363 : max, eptr));
4364 :
4365 0 : if ((ims & PCRE_CASELESS) != 0)
4366 : {
4367 0 : c = md->lcc[c];
4368 0 : for (i = 1; i <= min; i++)
4369 0 : if (c != md->lcc[*eptr++]) return FALSE;
4370 0 : if (min == max) continue;
4371 0 : if (minimize)
4372 : {
4373 0 : for (i = min;; i++)
4374 : {
4375 0 : if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4376 0 : return TRUE;
4377 0 : if (i >= max || eptr >= md->end_subject ||
4378 0 : c != md->lcc[*eptr++])
4379 0 : return FALSE;
4380 : }
4381 : /* Control never gets here */
4382 : }
4383 : else
4384 : {
4385 0 : const uschar *pp = eptr;
4386 0 : for (i = min; i < max; i++)
4387 : {
4388 0 : if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4389 0 : eptr++;
4390 : }
4391 0 : while (eptr >= pp)
4392 0 : if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4393 0 : return TRUE;
4394 0 : return FALSE;
4395 : }
4396 : /* Control never gets here */
4397 : }
4398 :
4399 : /* Caseful comparisons */
4400 :
4401 : else
4402 : {
4403 0 : for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4404 0 : if (min == max) continue;
4405 0 : if (minimize)
4406 : {
4407 0 : for (i = min;; i++)
4408 : {
4409 0 : if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4410 0 : return TRUE;
4411 0 : if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4412 : }
4413 : /* Control never gets here */
4414 : }
4415 : else
4416 : {
4417 0 : const uschar *pp = eptr;
4418 0 : for (i = min; i < max; i++)
4419 : {
4420 0 : if (eptr >= md->end_subject || c != *eptr) break;
4421 0 : eptr++;
4422 : }
4423 0 : while (eptr >= pp)
4424 0 : if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4425 0 : return TRUE;
4426 0 : return FALSE;
4427 : }
4428 : }
4429 : /* Control never gets here */
4430 :
4431 : /* Match a negated single character */
4432 :
4433 0 : case OP_NOT:
4434 0 : if (eptr >= md->end_subject) return FALSE;
4435 0 : ecode++;
4436 0 : if ((ims & PCRE_CASELESS) != 0)
4437 : {
4438 0 : if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4439 : }
4440 : else
4441 : {
4442 0 : if (*ecode++ == *eptr++) return FALSE;
4443 : }
4444 0 : break;
4445 :
4446 : /* Match a negated single character repeatedly. This is almost a repeat of
4447 : the code for a repeated single character, but I haven't found a nice way of
4448 : commoning these up that doesn't require a test of the positive/negative
4449 : option for each character match. Maybe that wouldn't add very much to the
4450 : time taken, but character matching *is* what this is all about... */
4451 :
4452 0 : case OP_NOTEXACT:
4453 0 : min = max = (ecode[1] << 8) + ecode[2];
4454 0 : ecode += 3;
4455 0 : goto REPEATNOTCHAR;
4456 :
4457 0 : case OP_NOTUPTO:
4458 : case OP_NOTMINUPTO:
4459 0 : min = 0;
4460 0 : max = (ecode[1] << 8) + ecode[2];
4461 0 : minimize = *ecode == OP_NOTMINUPTO;
4462 0 : ecode += 3;
4463 0 : goto REPEATNOTCHAR;
4464 :
4465 0 : case OP_NOTSTAR:
4466 : case OP_NOTMINSTAR:
4467 : case OP_NOTPLUS:
4468 : case OP_NOTMINPLUS:
4469 : case OP_NOTQUERY:
4470 : case OP_NOTMINQUERY:
4471 0 : c = *ecode++ - OP_NOTSTAR;
4472 0 : minimize = (c & 1) != 0;
4473 0 : min = rep_min[c]; /* Pick up values from tables; */
4474 0 : max = rep_max[c]; /* zero for max => infinity */
4475 0 : if (max == 0) max = INT_MAX;
4476 :
4477 : /* Common code for all repeated negated single-character matches. We can give
4478 : up quickly if there are fewer than the minimum number of characters left in
4479 : the subject. */
4480 :
4481 0 : REPEATNOTCHAR:
4482 0 : if (min > md->end_subject - eptr) return FALSE;
4483 0 : c = *ecode++;
4484 :
4485 : /* The code is duplicated for the caseless and caseful cases, for speed,
4486 : since matching characters is likely to be quite common. First, ensure the
4487 : minimum number of matches are present. If min = max, continue at the same
4488 : level without recursing. Otherwise, if minimizing, keep trying the rest of
4489 : the expression and advancing one matching character if failing, up to the
4490 : maximum. Alternatively, if maximizing, find the maximum number of
4491 : characters and work backwards. */
4492 :
4493 : DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4494 : max, eptr));
4495 :
4496 0 : if ((ims & PCRE_CASELESS) != 0)
4497 : {
4498 0 : c = md->lcc[c];
4499 0 : for (i = 1; i <= min; i++)
4500 0 : if (c == md->lcc[*eptr++]) return FALSE;
4501 0 : if (min == max) continue;
4502 0 : if (minimize)
4503 : {
4504 0 : for (i = min;; i++)
4505 : {
4506 0 : if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4507 0 : return TRUE;
4508 0 : if (i >= max || eptr >= md->end_subject ||
4509 0 : c == md->lcc[*eptr++])
4510 0 : return FALSE;
4511 : }
4512 : /* Control never gets here */
4513 : }
4514 : else
4515 : {
4516 0 : const uschar *pp = eptr;
4517 0 : for (i = min; i < max; i++)
4518 : {
4519 0 : if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4520 0 : eptr++;
4521 : }
4522 0 : while (eptr >= pp)
4523 0 : if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4524 0 : return TRUE;
4525 0 : return FALSE;
4526 : }
4527 : /* Control never gets here */
4528 : }
4529 :
4530 : /* Caseful comparisons */
4531 :
4532 : else
4533 : {
4534 0 : for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4535 0 : if (min == max) continue;
4536 0 : if (minimize)
4537 : {
4538 0 : for (i = min;; i++)
4539 : {
4540 0 : if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4541 0 : return TRUE;
4542 0 : if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4543 : }
4544 : /* Control never gets here */
4545 : }
4546 : else
4547 : {
4548 0 : const uschar *pp = eptr;
4549 0 : for (i = min; i < max; i++)
4550 : {
4551 0 : if (eptr >= md->end_subject || c == *eptr) break;
4552 0 : eptr++;
4553 : }
4554 0 : while (eptr >= pp)
4555 0 : if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4556 0 : return TRUE;
4557 0 : return FALSE;
4558 : }
4559 : }
4560 : /* Control never gets here */
4561 :
4562 : /* Match a single character type repeatedly; several different opcodes
4563 : share code. This is very similar to the code for single characters, but we
4564 : repeat it in the interests of efficiency. */
4565 :
4566 0 : case OP_TYPEEXACT:
4567 0 : min = max = (ecode[1] << 8) + ecode[2];
4568 0 : minimize = TRUE;
4569 0 : ecode += 3;
4570 0 : goto REPEATTYPE;
4571 :
4572 0 : case OP_TYPEUPTO:
4573 : case OP_TYPEMINUPTO:
4574 0 : min = 0;
4575 0 : max = (ecode[1] << 8) + ecode[2];
4576 0 : minimize = *ecode == OP_TYPEMINUPTO;
4577 0 : ecode += 3;
4578 0 : goto REPEATTYPE;
4579 :
4580 0 : case OP_TYPESTAR:
4581 : case OP_TYPEMINSTAR:
4582 : case OP_TYPEPLUS:
4583 : case OP_TYPEMINPLUS:
4584 : case OP_TYPEQUERY:
4585 : case OP_TYPEMINQUERY:
4586 0 : c = *ecode++ - OP_TYPESTAR;
4587 0 : minimize = (c & 1) != 0;
4588 0 : min = rep_min[c]; /* Pick up values from tables; */
4589 0 : max = rep_max[c]; /* zero for max => infinity */
4590 0 : if (max == 0) max = INT_MAX;
4591 :
4592 : /* Common code for all repeated single character type matches */
4593 :
4594 0 : REPEATTYPE:
4595 0 : ctype = *ecode++; /* Code for the character type */
4596 :
4597 : /* First, ensure the minimum number of matches are present. Use inline
4598 : code for maximizing the speed, and do the type test once at the start
4599 : (i.e. keep it out of the loop). Also we can test that there are at least
4600 : the minimum number of bytes before we start, except when doing '.' in
4601 : UTF8 mode. Leave the test in place in all cases; in the special case we have
4602 : to test after each character. */
4603 :
4604 0 : if (min > md->end_subject - eptr) return FALSE;
4605 0 : if (min > 0) switch(ctype)
4606 : {
4607 0 : case OP_ANY:
4608 : #ifdef SUPPORT_UTF8
4609 : if (md->utf8)
4610 : {
4611 : for (i = 1; i <= min; i++)
4612 : {
4613 : if (eptr >= md->end_subject ||
4614 : (*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
4615 : return FALSE;
4616 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4617 : }
4618 : break;
4619 : }
4620 : #endif
4621 : /* Non-UTF8 can be faster */
4622 0 : if ((ims & PCRE_DOTALL) == 0)
4623 0 : { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4624 0 : else eptr += min;
4625 0 : break;
4626 :
4627 0 : case OP_NOT_DIGIT:
4628 0 : for (i = 1; i <= min; i++)
4629 0 : if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4630 0 : break;
4631 :
4632 0 : case OP_DIGIT:
4633 0 : for (i = 1; i <= min; i++)
4634 0 : if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4635 0 : break;
4636 :
4637 0 : case OP_NOT_WHITESPACE:
4638 0 : for (i = 1; i <= min; i++)
4639 0 : if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4640 0 : break;
4641 :
4642 0 : case OP_WHITESPACE:
4643 0 : for (i = 1; i <= min; i++)
4644 0 : if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4645 0 : break;
4646 :
4647 0 : case OP_NOT_WORDCHAR:
4648 0 : for (i = 1; i <= min; i++)
4649 0 : if ((md->ctypes[*eptr++] & ctype_word) != 0)
4650 0 : return FALSE;
4651 0 : break;
4652 :
4653 0 : case OP_WORDCHAR:
4654 0 : for (i = 1; i <= min; i++)
4655 0 : if ((md->ctypes[*eptr++] & ctype_word) == 0)
4656 0 : return FALSE;
4657 0 : break;
4658 : }
4659 :
4660 : /* If min = max, continue at the same level without recursing */
4661 :
4662 0 : if (min == max) continue;
4663 :
4664 : /* If minimizing, we have to test the rest of the pattern before each
4665 : subsequent match. */
4666 :
4667 0 : if (minimize)
4668 : {
4669 0 : for (i = min;; i++)
4670 : {
4671 0 : if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4672 0 : if (i >= max || eptr >= md->end_subject) return FALSE;
4673 :
4674 0 : c = *eptr++;
4675 : switch(ctype)
4676 : {
4677 0 : case OP_ANY:
4678 0 : if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4679 : #ifdef SUPPORT_UTF8
4680 : if (md->utf8)
4681 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4682 : #endif
4683 0 : break;
4684 :
4685 0 : case OP_NOT_DIGIT:
4686 0 : if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4687 0 : break;
4688 :
4689 0 : case OP_DIGIT:
4690 0 : if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4691 0 : break;
4692 :
4693 0 : case OP_NOT_WHITESPACE:
4694 0 : if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4695 0 : break;
4696 :
4697 0 : case OP_WHITESPACE:
4698 0 : if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4699 0 : break;
4700 :
4701 0 : case OP_NOT_WORDCHAR:
4702 0 : if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4703 0 : break;
4704 :
4705 0 : case OP_WORDCHAR:
4706 0 : if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4707 0 : break;
4708 : }
4709 0 : }
4710 : /* Control never gets here */
4711 : }
4712 :
4713 : /* If maximizing it is worth using inline code for speed, doing the type
4714 : test once at the start (i.e. keep it out of the loop). */
4715 :
4716 : else
4717 : {
4718 0 : const uschar *pp = eptr;
4719 : switch(ctype)
4720 : {
4721 0 : case OP_ANY:
4722 :
4723 : /* Special code is required for UTF8, but when the maximum is unlimited
4724 : we don't need it. */
4725 :
4726 : #ifdef SUPPORT_UTF8
4727 : if (md->utf8 && max < INT_MAX)
4728 : {
4729 : if ((ims & PCRE_DOTALL) == 0)
4730 : {
4731 : for (i = min; i < max; i++)
4732 : {
4733 : if (eptr >= md->end_subject || *eptr++ == '\n') break;
4734 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4735 : }
4736 : }
4737 : else
4738 : {
4739 : for (i = min; i < max; i++)
4740 : {
4741 : eptr++;
4742 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4743 : }
4744 : }
4745 : break;
4746 : }
4747 : #endif
4748 : /* Non-UTF8 can be faster */
4749 0 : if ((ims & PCRE_DOTALL) == 0)
4750 : {
4751 0 : for (i = min; i < max; i++)
4752 : {
4753 0 : if (eptr >= md->end_subject || *eptr == '\n') break;
4754 0 : eptr++;
4755 : }
4756 : }
4757 : else
4758 : {
4759 0 : c = max - min;
4760 0 : if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4761 0 : eptr += c;
4762 : }
4763 0 : break;
4764 :
4765 0 : case OP_NOT_DIGIT:
4766 0 : for (i = min; i < max; i++)
4767 : {
4768 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4769 : break;
4770 0 : eptr++;
4771 : }
4772 0 : break;
4773 :
4774 0 : case OP_DIGIT:
4775 0 : for (i = min; i < max; i++)
4776 : {
4777 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4778 : break;
4779 0 : eptr++;
4780 : }
4781 0 : break;
4782 :
4783 0 : case OP_NOT_WHITESPACE:
4784 0 : for (i = min; i < max; i++)
4785 : {
4786 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4787 : break;
4788 0 : eptr++;
4789 : }
4790 0 : break;
4791 :
4792 0 : case OP_WHITESPACE:
4793 0 : for (i = min; i < max; i++)
4794 : {
4795 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4796 : break;
4797 0 : eptr++;
4798 : }
4799 0 : break;
4800 :
4801 0 : case OP_NOT_WORDCHAR:
4802 0 : for (i = min; i < max; i++)
4803 : {
4804 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4805 : break;
4806 0 : eptr++;
4807 : }
4808 0 : break;
4809 :
4810 0 : case OP_WORDCHAR:
4811 0 : for (i = min; i < max; i++)
4812 : {
4813 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4814 : break;
4815 0 : eptr++;
4816 : }
4817 0 : break;
4818 : }
4819 :
4820 0 : while (eptr >= pp)
4821 : {
4822 0 : if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4823 0 : return TRUE;
4824 : #ifdef SUPPORT_UTF8
4825 : if (md->utf8)
4826 : while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4827 : #endif
4828 : }
4829 0 : return FALSE;
4830 : }
4831 : /* Control never gets here */
4832 :
4833 : /* There's been some horrible disaster. */
4834 :
4835 0 : default:
4836 : DPRINTF(("Unknown opcode %d\n", *ecode));
4837 0 : md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4838 0 : return FALSE;
4839 : }
4840 :
4841 : /* Do not stick any code in here without much thought; it is assumed
4842 : that "continue" in the code above comes out to here to repeat the main
4843 : loop. */
4844 :
4845 : } /* End of main loop */
4846 : /* Control never reaches here */
4847 : }
4848 :
4849 :
4850 :
4851 :
4852 : /*************************************************
4853 : * Execute a Regular Expression *
4854 : *************************************************/
4855 :
4856 : /* This function applies a compiled re to a subject string and picks out
4857 : portions of the string if it matches. Two elements in the vector are set for
4858 : each substring: the offsets to the start and end of the substring.
4859 :
4860 : Arguments:
4861 : external_re points to the compiled expression
4862 : external_extra points to "hints" from pcre_study() or is NULL
4863 : subject points to the subject string
4864 : length length of subject string (may contain binary zeros)
4865 : start_offset where to start in the subject string
4866 : options option bits
4867 : offsets points to a vector of ints to be filled in with offsets
4868 : offsetcount the number of elements in the vector
4869 :
4870 : Returns: > 0 => success; value is the number of elements filled in
4871 : = 0 => success, but offsets is not big enough
4872 : -1 => failed to match
4873 : < -1 => some kind of unexpected problem
4874 : */
4875 :
4876 : __attribute__((no_sanitize("memory"))) __attribute__((no_sanitize_memory)) int
4877 0 : pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4878 : const char *subject, int length, int start_offset, int options, int *offsets,
4879 : int offsetcount)
4880 : {
4881 : int resetcount, ocount;
4882 0 : int first_char = -1;
4883 0 : int req_char = -1;
4884 0 : int req_char2 = -1;
4885 0 : unsigned long int ims = 0;
4886 : match_data match_block;
4887 0 : const uschar *start_bits = NULL;
4888 0 : const uschar *start_match = (const uschar *)subject + start_offset;
4889 : const uschar *end_subject;
4890 0 : const uschar *req_char_ptr = start_match - 1;
4891 0 : const real_pcre *re = (const real_pcre *)external_re;
4892 0 : const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4893 0 : BOOL using_temporary_offsets = FALSE;
4894 0 : BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4895 0 : BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4896 :
4897 0 : if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4898 :
4899 0 : if (re == NULL || subject == NULL ||
4900 0 : (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4901 0 : if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4902 :
4903 0 : match_block.start_pattern = re->code;
4904 0 : match_block.start_subject = (const uschar *)subject;
4905 0 : match_block.end_subject = match_block.start_subject + length;
4906 0 : end_subject = match_block.end_subject;
4907 :
4908 0 : match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4909 0 : match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4910 :
4911 0 : match_block.notbol = (options & PCRE_NOTBOL) != 0;
4912 0 : match_block.noteol = (options & PCRE_NOTEOL) != 0;
4913 0 : match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4914 :
4915 0 : match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4916 :
4917 0 : match_block.lcc = re->tables + lcc_offset;
4918 0 : match_block.ctypes = re->tables + ctypes_offset;
4919 :
4920 : /* The ims options can vary during the matching as a result of the presence
4921 : of (?ims) items in the pattern. They are kept in a local variable so that
4922 : restoring at the exit of a group is easy. */
4923 :
4924 0 : ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4925 :
4926 : /* If the expression has got more back references than the offsets supplied can
4927 : hold, we get a temporary bit of working store to use during the matching.
4928 : Otherwise, we can use the vector supplied, rounding down its size to a multiple
4929 : of 3. */
4930 :
4931 0 : ocount = offsetcount - (offsetcount % 3);
4932 :
4933 0 : if (re->top_backref > 0 && re->top_backref >= ocount/3)
4934 : {
4935 0 : ocount = re->top_backref * 3 + 3;
4936 0 : match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4937 0 : if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4938 0 : using_temporary_offsets = TRUE;
4939 : DPRINTF(("Got memory to hold back references\n"));
4940 : }
4941 0 : else match_block.offset_vector = offsets;
4942 :
4943 0 : match_block.offset_end = ocount;
4944 0 : match_block.offset_max = (2*ocount)/3;
4945 0 : match_block.offset_overflow = FALSE;
4946 :
4947 : /* Compute the minimum number of offsets that we need to reset each time. Doing
4948 : this makes a huge difference to execution time when there aren't many brackets
4949 : in the pattern. */
4950 :
4951 0 : resetcount = 2 + re->top_bracket * 2;
4952 0 : if (resetcount > offsetcount) resetcount = ocount;
4953 :
4954 : /* Reset the working variable associated with each extraction. These should
4955 : never be used unless previously set, but they get saved and restored, and so we
4956 : initialize them to avoid reading uninitialized locations. */
4957 :
4958 0 : if (match_block.offset_vector != NULL)
4959 : {
4960 0 : register int *iptr = match_block.offset_vector + ocount;
4961 0 : register int *iend = iptr - resetcount/2 + 1;
4962 0 : while (--iptr >= iend) *iptr = -1;
4963 : }
4964 :
4965 : /* Set up the first character to match, if available. The first_char value is
4966 : never set for an anchored regular expression, but the anchoring may be forced
4967 : at run time, so we have to test for anchoring. The first char may be unset for
4968 : an unanchored pattern, of course. If there's no first char and the pattern was
4969 : studied, there may be a bitmap of possible first characters. */
4970 :
4971 0 : if (!anchored)
4972 : {
4973 0 : if ((re->options & PCRE_FIRSTSET) != 0)
4974 : {
4975 0 : first_char = re->first_char;
4976 0 : if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4977 : }
4978 : else
4979 0 : if (!startline && extra != NULL &&
4980 0 : (extra->options & PCRE_STUDY_MAPPED) != 0)
4981 0 : start_bits = extra->start_bits;
4982 : }
4983 :
4984 : /* For anchored or unanchored matches, there may be a "last known required
4985 : character" set. If PCRE_CASELESS is set, implying that the match starts
4986 : caselessly, or if there are any changes of this flag within the regex, set up
4987 : both cases of the character. Otherwise set the two values the same, which will
4988 : avoid duplicate testing (which takes significant time). This covers the vast
4989 : majority of cases. It will be suboptimal when the case flag changes in a regex
4990 : and the required character in fact is caseful. */
4991 :
4992 0 : if ((re->options & PCRE_REQCHSET) != 0)
4993 : {
4994 0 : req_char = re->req_char;
4995 0 : req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4996 0 : (re->tables + fcc_offset)[req_char] : req_char;
4997 : }
4998 :
4999 : /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5000 : the loop runs just once. */
5001 :
5002 : do
5003 : {
5004 : int rc;
5005 0 : register int *iptr = match_block.offset_vector;
5006 0 : register int *iend = iptr + resetcount;
5007 :
5008 : /* Reset the maximum number of extractions we might see. */
5009 :
5010 0 : while (iptr < iend) *iptr++ = -1;
5011 :
5012 : /* Advance to a unique first char if possible */
5013 :
5014 0 : if (first_char >= 0)
5015 : {
5016 0 : if ((ims & PCRE_CASELESS) != 0)
5017 0 : while (start_match < end_subject &&
5018 0 : match_block.lcc[*start_match] != first_char)
5019 0 : start_match++;
5020 : else
5021 0 : while (start_match < end_subject && *start_match != first_char)
5022 0 : start_match++;
5023 : }
5024 :
5025 : /* Or to just after \n for a multiline match if possible */
5026 :
5027 0 : else if (startline)
5028 : {
5029 0 : if (start_match > match_block.start_subject + start_offset)
5030 : {
5031 0 : while (start_match < end_subject && start_match[-1] != '\n')
5032 0 : start_match++;
5033 : }
5034 : }
5035 :
5036 : /* Or to a non-unique first char after study */
5037 :
5038 0 : else if (start_bits != NULL)
5039 : {
5040 0 : while (start_match < end_subject)
5041 : {
5042 0 : register int c = *start_match;
5043 0 : if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5044 : }
5045 : }
5046 :
5047 : #ifdef DEBUG /* Sigh. Some compilers never learn. */
5048 : printf(">>>> Match against: ");
5049 : pchars(start_match, end_subject - start_match, TRUE, &match_block);
5050 : printf("\n");
5051 : #endif
5052 :
5053 : /* If req_char is set, we know that that character must appear in the subject
5054 : for the match to succeed. If the first character is set, req_char must be
5055 : later in the subject; otherwise the test starts at the match point. This
5056 : optimization can save a huge amount of backtracking in patterns with nested
5057 : unlimited repeats that aren't going to match. We don't know what the state of
5058 : case matching may be when this character is hit, so test for it in both its
5059 : cases if necessary. However, the different cased versions will not be set up
5060 : unless PCRE_CASELESS was given or the casing state changes within the regex.
5061 : Writing separate code makes it go faster, as does using an autoincrement and
5062 : backing off on a match. */
5063 :
5064 0 : if (req_char >= 0)
5065 : {
5066 0 : register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5067 :
5068 : /* We don't need to repeat the search if we haven't yet reached the
5069 : place we found it at last time. */
5070 :
5071 0 : if (p > req_char_ptr)
5072 : {
5073 : /* Do a single test if no case difference is set up */
5074 :
5075 0 : if (req_char == req_char2)
5076 : {
5077 0 : while (p < end_subject)
5078 : {
5079 0 : if (*p++ == req_char) { p--; break; }
5080 : }
5081 : }
5082 :
5083 : /* Otherwise test for either case */
5084 :
5085 : else
5086 : {
5087 0 : while (p < end_subject)
5088 : {
5089 0 : register int pp = *p++;
5090 0 : if (pp == req_char || pp == req_char2) { p--; break; }
5091 : }
5092 : }
5093 :
5094 : /* If we can't find the required character, break the matching loop */
5095 :
5096 0 : if (p >= end_subject) break;
5097 :
5098 : /* If we have found the required character, save the point where we
5099 : found it, so that we don't search again next time round the loop if
5100 : the start hasn't passed this character yet. */
5101 :
5102 0 : req_char_ptr = p;
5103 : }
5104 : }
5105 :
5106 : /* When a match occurs, substrings will be set for all internal extractions;
5107 : we just need to set up the whole thing as substring 0 before returning. If
5108 : there were too many extractions, set the return code to zero. In the case
5109 : where we had to get some local store to hold offsets for backreferences, copy
5110 : those back references that we can. In this case there need not be overflow
5111 : if certain parts of the pattern were not used. */
5112 :
5113 0 : match_block.start_match = start_match;
5114 0 : if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5115 0 : continue;
5116 :
5117 : /* Copy the offset information from temporary store if necessary */
5118 :
5119 0 : if (using_temporary_offsets)
5120 : {
5121 0 : if (offsetcount >= 4)
5122 : {
5123 0 : memcpy(offsets + 2, match_block.offset_vector + 2,
5124 0 : (offsetcount - 2) * sizeof(int));
5125 : DPRINTF(("Copied offsets from temporary memory\n"));
5126 : }
5127 0 : if (match_block.end_offset_top > offsetcount)
5128 0 : match_block.offset_overflow = TRUE;
5129 :
5130 : DPRINTF(("Freeing temporary memory\n"));
5131 0 : (pcre_free)(match_block.offset_vector);
5132 : }
5133 :
5134 0 : rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5135 :
5136 0 : if (match_block.offset_end < 2) rc = 0; else
5137 : {
5138 0 : offsets[0] = start_match - match_block.start_subject;
5139 0 : offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5140 : }
5141 :
5142 : DPRINTF((">>>> returning %d\n", rc));
5143 0 : return rc;
5144 : }
5145 :
5146 : /* This "while" is the end of the "do" above */
5147 :
5148 0 : while (!anchored &&
5149 0 : match_block.errorcode == PCRE_ERROR_NOMATCH &&
5150 0 : start_match++ < end_subject);
5151 :
5152 0 : if (using_temporary_offsets)
5153 : {
5154 : DPRINTF(("Freeing temporary memory\n"));
5155 0 : (pcre_free)(match_block.offset_vector);
5156 : }
5157 :
5158 : DPRINTF((">>>> returning %d\n", match_block.errorcode));
5159 :
5160 0 : return match_block.errorcode;
5161 : }
5162 :
5163 : /* End of pcre.c */
|