Line data Source code
1 : /*************************************************
2 : * Perl-Compatible Regular Expressions *
3 : *************************************************/
4 :
5 : /*
6 : This is a library of functions to support regular expressions whose syntax
7 : and semantics are as close as possible to those of the Perl 5 language. See
8 : the file Tech.Notes for some information on the internals.
9 :
10 : Written by: Philip Hazel <ph10@cam.ac.uk>
11 :
12 : Copyright (c) 1997-2000 University of Cambridge
13 :
14 : -----------------------------------------------------------------------------
15 : Permission is granted to anyone to use this software for any purpose on any
16 : computer system, and to redistribute it freely, subject to the following
17 : restrictions:
18 :
19 : 1. This software is distributed in the hope that it will be useful,
20 : but WITHOUT ANY WARRANTY; without even the implied warranty of
21 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 :
23 : 2. The origin of this software must not be misrepresented, either by
24 : explicit claim or by omission.
25 :
26 : 3. Altered versions must be plainly marked as such, and must not be
27 : misrepresented as being the original software.
28 :
29 : 4. If PCRE is embedded in any software that is released under the GNU
30 : General Purpose Licence (GPL), then the terms of that licence shall
31 : supersede any condition above with which it is incompatible.
32 : -----------------------------------------------------------------------------
33 : */
34 :
35 :
36 : /* Include the internals header, which itself includes Standard C headers plus
37 : the external pcre header. */
38 :
39 : #include "internal.h"
40 :
41 :
42 :
43 : /*************************************************
44 : * Set a bit and maybe its alternate case *
45 : *************************************************/
46 :
47 : /* Given a character, set its bit in the table, and also the bit for the other
48 : version of a letter if we are caseless.
49 :
50 : Arguments:
51 : start_bits points to the bit map
52 : c is the character
53 : caseless the caseless flag
54 : cd the block with char table pointers
55 :
56 : Returns: nothing
57 : */
58 :
59 : static void
60 0 : set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 : {
62 0 : start_bits[c/8] |= (1 << (c&7));
63 0 : if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64 0 : start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 0 : }
66 :
67 :
68 :
69 : /*************************************************
70 : * Create bitmap of starting chars *
71 : *************************************************/
72 :
73 : /* This function scans a compiled unanchored expression and attempts to build a
74 : bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75 : goes by, we may be able to get more clever at doing this.
76 :
77 : Arguments:
78 : code points to an expression
79 : start_bits points to a 32-byte table, initialized to 0
80 : caseless the current state of the caseless flag
81 : cd the block with char table pointers
82 :
83 : Returns: TRUE if table built, FALSE otherwise
84 : */
85 :
86 : static BOOL
87 0 : set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88 : compile_data *cd)
89 : {
90 : register int c;
91 :
92 : /* This next statement and the later reference to dummy are here in order to
93 : trick the optimizer of the IBM C compiler for OS/2 into generating correct
94 : code. Apparently IBM isn't going to fix the problem, and we would rather not
95 : disable optimization (in this module it actually makes a big difference, and
96 : the pcre module can use all the optimization it can get). */
97 :
98 : volatile int dummy;
99 :
100 : do
101 : {
102 0 : const uschar *tcode = code + 3;
103 0 : BOOL try_next = TRUE;
104 :
105 0 : while (try_next)
106 : {
107 0 : try_next = FALSE;
108 :
109 : /* If a branch starts with a bracket or a positive lookahead assertion,
110 : recurse to set bits from within them. That's all for this branch. */
111 :
112 0 : if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
113 : {
114 0 : if (!set_start_bits(tcode, start_bits, caseless, cd))
115 0 : return FALSE;
116 : }
117 :
118 0 : else switch(*tcode)
119 : {
120 0 : default:
121 0 : return FALSE;
122 :
123 : /* Skip over lookbehind and negative lookahead assertions */
124 :
125 0 : case OP_ASSERT_NOT:
126 : case OP_ASSERTBACK:
127 : case OP_ASSERTBACK_NOT:
128 0 : try_next = TRUE;
129 0 : do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
130 0 : tcode += 3;
131 0 : break;
132 :
133 : /* Skip over an option setting, changing the caseless flag */
134 :
135 0 : case OP_OPT:
136 0 : caseless = (tcode[1] & PCRE_CASELESS) != 0;
137 0 : tcode += 2;
138 0 : try_next = TRUE;
139 0 : break;
140 :
141 : /* BRAZERO does the bracket, but carries on. */
142 :
143 0 : case OP_BRAZERO:
144 : case OP_BRAMINZERO:
145 0 : if (!set_start_bits(++tcode, start_bits, caseless, cd))
146 0 : return FALSE;
147 0 : dummy = 1;
148 0 : do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
149 0 : tcode += 3;
150 0 : try_next = TRUE;
151 0 : break;
152 :
153 : /* Single-char * or ? sets the bit and tries the next item */
154 :
155 0 : case OP_STAR:
156 : case OP_MINSTAR:
157 : case OP_QUERY:
158 : case OP_MINQUERY:
159 0 : set_bit(start_bits, tcode[1], caseless, cd);
160 0 : tcode += 2;
161 0 : try_next = TRUE;
162 0 : break;
163 :
164 : /* Single-char upto sets the bit and tries the next */
165 :
166 0 : case OP_UPTO:
167 : case OP_MINUPTO:
168 0 : set_bit(start_bits, tcode[3], caseless, cd);
169 0 : tcode += 4;
170 0 : try_next = TRUE;
171 0 : break;
172 :
173 : /* At least one single char sets the bit and stops */
174 :
175 0 : case OP_EXACT: /* Fall through */
176 0 : tcode++;
177 :
178 0 : case OP_CHARS: /* Fall through */
179 0 : tcode++;
180 :
181 0 : case OP_PLUS:
182 : case OP_MINPLUS:
183 0 : set_bit(start_bits, tcode[1], caseless, cd);
184 0 : break;
185 :
186 : /* Single character type sets the bits and stops */
187 :
188 0 : case OP_NOT_DIGIT:
189 0 : for (c = 0; c < 32; c++)
190 0 : start_bits[c] |= ~cd->cbits[c+cbit_digit];
191 0 : break;
192 :
193 0 : case OP_DIGIT:
194 0 : for (c = 0; c < 32; c++)
195 0 : start_bits[c] |= cd->cbits[c+cbit_digit];
196 0 : break;
197 :
198 0 : case OP_NOT_WHITESPACE:
199 0 : for (c = 0; c < 32; c++)
200 0 : start_bits[c] |= ~cd->cbits[c+cbit_space];
201 0 : break;
202 :
203 0 : case OP_WHITESPACE:
204 0 : for (c = 0; c < 32; c++)
205 0 : start_bits[c] |= cd->cbits[c+cbit_space];
206 0 : break;
207 :
208 0 : case OP_NOT_WORDCHAR:
209 0 : for (c = 0; c < 32; c++)
210 0 : start_bits[c] |= ~cd->cbits[c+cbit_word];
211 0 : break;
212 :
213 0 : case OP_WORDCHAR:
214 0 : for (c = 0; c < 32; c++)
215 0 : start_bits[c] |= cd->cbits[c+cbit_word];
216 0 : break;
217 :
218 : /* One or more character type fudges the pointer and restarts, knowing
219 : it will hit a single character type and stop there. */
220 :
221 0 : case OP_TYPEPLUS:
222 : case OP_TYPEMINPLUS:
223 0 : tcode++;
224 0 : try_next = TRUE;
225 0 : break;
226 :
227 0 : case OP_TYPEEXACT:
228 0 : tcode += 3;
229 0 : try_next = TRUE;
230 0 : break;
231 :
232 : /* Zero or more repeats of character types set the bits and then
233 : try again. */
234 :
235 0 : case OP_TYPEUPTO:
236 : case OP_TYPEMINUPTO:
237 0 : tcode += 2; /* Fall through */
238 :
239 0 : case OP_TYPESTAR:
240 : case OP_TYPEMINSTAR:
241 : case OP_TYPEQUERY:
242 : case OP_TYPEMINQUERY:
243 0 : switch(tcode[1])
244 : {
245 0 : case OP_NOT_DIGIT:
246 0 : for (c = 0; c < 32; c++)
247 0 : start_bits[c] |= ~cd->cbits[c+cbit_digit];
248 0 : break;
249 :
250 0 : case OP_DIGIT:
251 0 : for (c = 0; c < 32; c++)
252 0 : start_bits[c] |= cd->cbits[c+cbit_digit];
253 0 : break;
254 :
255 0 : case OP_NOT_WHITESPACE:
256 0 : for (c = 0; c < 32; c++)
257 0 : start_bits[c] |= ~cd->cbits[c+cbit_space];
258 0 : break;
259 :
260 0 : case OP_WHITESPACE:
261 0 : for (c = 0; c < 32; c++)
262 0 : start_bits[c] |= cd->cbits[c+cbit_space];
263 0 : break;
264 :
265 0 : case OP_NOT_WORDCHAR:
266 0 : for (c = 0; c < 32; c++)
267 0 : start_bits[c] |= ~cd->cbits[c+cbit_word];
268 0 : break;
269 :
270 0 : case OP_WORDCHAR:
271 0 : for (c = 0; c < 32; c++)
272 0 : start_bits[c] |= cd->cbits[c+cbit_word];
273 0 : break;
274 : }
275 :
276 0 : tcode += 2;
277 0 : try_next = TRUE;
278 0 : break;
279 :
280 : /* Character class: set the bits and either carry on or not,
281 : according to the repeat count. */
282 :
283 0 : case OP_CLASS:
284 : {
285 0 : tcode++;
286 0 : for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
287 0 : tcode += 32;
288 0 : switch (*tcode)
289 : {
290 0 : case OP_CRSTAR:
291 : case OP_CRMINSTAR:
292 : case OP_CRQUERY:
293 : case OP_CRMINQUERY:
294 0 : tcode++;
295 0 : try_next = TRUE;
296 0 : break;
297 :
298 0 : case OP_CRRANGE:
299 : case OP_CRMINRANGE:
300 0 : if (((tcode[1] << 8) + tcode[2]) == 0)
301 : {
302 0 : tcode += 5;
303 0 : try_next = TRUE;
304 : }
305 0 : break;
306 : }
307 0 : }
308 0 : break; /* End of class handling */
309 :
310 : } /* End of switch */
311 : } /* End of try_next loop */
312 :
313 0 : code += (code[1] << 8) + code[2]; /* Advance to next branch */
314 : }
315 0 : while (*code == OP_ALT);
316 0 : return TRUE;
317 : }
318 :
319 :
320 :
321 : /*************************************************
322 : * Study a compiled expression *
323 : *************************************************/
324 :
325 : /* This function is handed a compiled expression that it must study to produce
326 : information that will speed up the matching. It returns a pcre_extra block
327 : which then gets handed back to pcre_exec().
328 :
329 : Arguments:
330 : re points to the compiled expression
331 : options contains option bits
332 : errorptr points to where to place error messages;
333 : set NULL unless error
334 :
335 : Returns: pointer to a pcre_extra block,
336 : NULL on error or if no optimization possible
337 : */
338 :
339 : pcre_extra *
340 0 : pcre_study(const pcre *external_re, int options, const char **errorptr)
341 : {
342 : uschar start_bits[32];
343 : real_pcre_extra *extra;
344 0 : const real_pcre *re = (const real_pcre *)external_re;
345 : compile_data compile_block;
346 :
347 0 : *errorptr = NULL;
348 :
349 0 : if (re == NULL || re->magic_number != MAGIC_NUMBER)
350 : {
351 0 : *errorptr = "argument is not a compiled regular expression";
352 0 : return NULL;
353 : }
354 :
355 0 : if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
356 : {
357 0 : *errorptr = "unknown or incorrect option bit(s) set";
358 0 : return NULL;
359 : }
360 :
361 : /* For an anchored pattern, or an unchored pattern that has a first char, or a
362 : multiline pattern that matches only at "line starts", no further processing at
363 : present. */
364 :
365 0 : if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
366 0 : return NULL;
367 :
368 : /* Set the character tables in the block which is passed around */
369 :
370 0 : compile_block.lcc = re->tables + lcc_offset;
371 0 : compile_block.fcc = re->tables + fcc_offset;
372 0 : compile_block.cbits = re->tables + cbits_offset;
373 0 : compile_block.ctypes = re->tables + ctypes_offset;
374 :
375 : /* See if we can find a fixed set of initial characters for the pattern. */
376 :
377 0 : memset(start_bits, 0, 32 * sizeof(uschar));
378 0 : if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
379 0 : &compile_block)) return NULL;
380 :
381 : /* Get an "extra" block and put the information therein. */
382 :
383 0 : extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
384 :
385 0 : if (extra == NULL)
386 : {
387 0 : *errorptr = "failed to get memory";
388 0 : return NULL;
389 : }
390 :
391 0 : extra->options = PCRE_STUDY_MAPPED;
392 0 : memcpy(extra->start_bits, start_bits, sizeof(start_bits));
393 :
394 0 : return (pcre_extra *)extra;
395 : }
396 :
397 : /* End of study.c */
|