Line data Source code
1 : /*********************************************************************
2 : *
3 : * File : $Source: /cvsroot/ijbswa/current/pcrs.c,v $
4 : *
5 : * Purpose : pcrs is a supplement to the pcre library by Philip Hazel
6 : * <ph10@cam.ac.uk> and adds Perl-style substitution. That
7 : * is, it mimics Perl's 's' operator. See pcrs(3) for details.
8 : *
9 : * WARNING: This file contains additional functions and bug
10 : * fixes that aren't part of the latest official pcrs package
11 : * (which apparently is no longer maintained).
12 : *
13 : * Copyright : Written and Copyright (C) 2000, 2001 by Andreas S. Oesterhelt
14 : * <andreas@oesterhelt.org>
15 : *
16 : * Copyright (C) 2006, 2007 Fabian Keil <fk@fabiankeil.de>
17 : *
18 : * This program is free software; you can redistribute it
19 : * and/or modify it under the terms of the GNU General
20 : * Public License as published by the Free Software
21 : * Foundation; either version 2 of the License, or (at
22 : * your option) any later version.
23 : *
24 : * This program is distributed in the hope that it will
25 : * be useful, but WITHOUT ANY WARRANTY; without even the
26 : * implied warranty of MERCHANTABILITY or FITNESS FOR A
27 : * PARTICULAR PURPOSE. See the GNU General Public
28 : * License for more details.
29 : *
30 : * The GNU General Public License should be included with
31 : * this file. If not, you can view it at
32 : * http://www.gnu.org/copyleft/gpl.html
33 : * or write to the Free Software Foundation, Inc., 59
34 : * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
35 : *
36 : *********************************************************************/
37 :
38 :
39 : #include <string.h>
40 : #include <ctype.h>
41 : #include <assert.h>
42 :
43 : /*
44 : * Include project.h just so that the right pcre.h gets
45 : * included from there
46 : */
47 : #include "project.h"
48 :
49 : /* For snprintf only */
50 : #include "miscutil.h"
51 : /* For xtoi */
52 : #include "encode.h"
53 :
54 : #include "pcrs.h"
55 :
56 : /*
57 : * Internal prototypes
58 : */
59 :
60 : static int pcrs_parse_perl_options(const char *optstring, int *flags);
61 : static pcrs_substitute *pcrs_compile_replacement(const char *replacement, int trivialflag,
62 : int capturecount, int *errptr);
63 : static int is_hex_sequence(const char *sequence);
64 :
65 : /*********************************************************************
66 : *
67 : * Function : pcrs_strerror
68 : *
69 : * Description : Return a string describing a given error code.
70 : *
71 : * Parameters :
72 : * 1 : error = the error code
73 : *
74 : * Returns : char * to the descriptive string
75 : *
76 : *********************************************************************/
77 1 : const char *pcrs_strerror(const int error)
78 : {
79 : static char buf[100];
80 :
81 1 : if (error != 0)
82 : {
83 1 : switch (error)
84 : {
85 : /* Passed-through PCRE error: */
86 0 : case PCRE_ERROR_NOMEMORY: return "(pcre:) No memory";
87 :
88 : /* Shouldn't happen unless PCRE or PCRS bug, or user messed with compiled job: */
89 0 : case PCRE_ERROR_NULL: return "(pcre:) NULL code or subject or ovector";
90 0 : case PCRE_ERROR_BADOPTION: return "(pcre:) Unrecognized option bit";
91 0 : case PCRE_ERROR_BADMAGIC: return "(pcre:) Bad magic number in code";
92 0 : case PCRE_ERROR_UNKNOWN_NODE: return "(pcre:) Bad node in pattern";
93 :
94 : /* Can't happen / not passed: */
95 0 : case PCRE_ERROR_NOSUBSTRING: return "(pcre:) Fire in power supply";
96 0 : case PCRE_ERROR_NOMATCH: return "(pcre:) Water in power supply";
97 :
98 : #ifdef PCRE_ERROR_MATCHLIMIT
99 : /*
100 : * Only reported by PCRE versions newer than our own.
101 : */
102 0 : case PCRE_ERROR_MATCHLIMIT: return "(pcre:) Match limit reached";
103 : #endif /* def PCRE_ERROR_MATCHLIMIT */
104 :
105 : /* PCRS errors: */
106 0 : case PCRS_ERR_NOMEM: return "(pcrs:) No memory";
107 0 : case PCRS_ERR_CMDSYNTAX: return "(pcrs:) Syntax error while parsing command";
108 0 : case PCRS_ERR_STUDY: return "(pcrs:) PCRE error while studying the pattern";
109 0 : case PCRS_ERR_BADJOB: return "(pcrs:) Bad job - NULL job, pattern or substitute";
110 0 : case PCRS_WARN_BADREF: return "(pcrs:) Backreference out of range";
111 0 : case PCRS_WARN_TRUNCATION:
112 0 : return "(pcrs:) At least one variable was too big and has been truncated before compilation";
113 :
114 : /*
115 : * XXX: With the exception of PCRE_ERROR_MATCHLIMIT we
116 : * only catch PCRE errors that can happen with our internal
117 : * version. If Privoxy is linked against a newer
118 : * PCRE version all bets are off ...
119 : */
120 1 : default:
121 1 : snprintf(buf, sizeof(buf),
122 : "Error code %d. For details, check the pcre documentation.",
123 : error);
124 1 : return buf;
125 : }
126 : }
127 : /* error >= 0: No error */
128 0 : return "(pcrs:) Everything's just fine. Thanks for asking.";
129 :
130 : }
131 :
132 :
133 : /*********************************************************************
134 : *
135 : * Function : pcrs_parse_perl_options
136 : *
137 : * Description : This function parses a string containing the options to
138 : * Perl's s/// operator. It returns an integer that is the
139 : * pcre equivalent of the symbolic optstring.
140 : * Since pcre doesn't know about Perl's 'g' (global) or pcrs',
141 : * 'T' (trivial) options but pcrs needs them, the corresponding
142 : * flags are set if 'g'or 'T' is encountered.
143 : * Note: The 'T' and 'U' options do not conform to Perl.
144 : *
145 : * Parameters :
146 : * 1 : optstring = string with options in perl syntax
147 : * 2 : flags = see description
148 : *
149 : * Returns : option integer suitable for pcre
150 : *
151 : *********************************************************************/
152 1261126 : static int pcrs_parse_perl_options(const char *optstring, int *flags)
153 : {
154 : size_t i;
155 1261126 : int rc = 0;
156 1261126 : *flags = 0;
157 :
158 1261126 : if (NULL == optstring) return 0;
159 :
160 7320664 : for (i = 0; i < strlen(optstring); i++)
161 : {
162 6059538 : switch(optstring[i])
163 : {
164 0 : case 'e': break; /* ToDo ;-) */
165 1236072 : case 'g': *flags |= PCRS_GLOBAL; break;
166 1257758 : case 'i': rc |= PCRE_CASELESS; break;
167 0 : case 'm': rc |= PCRE_MULTILINE; break;
168 0 : case 'o': break;
169 1198896 : case 's': rc |= PCRE_DOTALL; break;
170 0 : case 'x': rc |= PCRE_EXTENDED; break;
171 0 : case 'D': *flags |= PCRS_DYNAMIC; break;
172 1217484 : case 'U': rc |= PCRE_UNGREEDY; break;
173 1149328 : case 'T': *flags |= PCRS_TRIVIAL; break;
174 0 : default: break;
175 : }
176 : }
177 1261126 : return rc;
178 :
179 : }
180 :
181 :
182 : #ifdef FUZZ
183 : /*********************************************************************
184 : *
185 : * Function : pcrs_compile_fuzzed_replacement
186 : *
187 : * Description : Wrapper around pcrs_compile_replacement() for
188 : * fuzzing purposes.
189 : *
190 : * Parameters :
191 : * 1 : replacement = replacement part of s/// operator
192 : * in perl syntax
193 : * 2 : errptr = pointer to an integer in which error
194 : * conditions can be returned.
195 : *
196 : * Returns : pcrs_substitute data structure, or NULL if an
197 : * error is encountered. In that case, *errptr has
198 : * the reason.
199 : *
200 : *********************************************************************/
201 0 : extern pcrs_substitute *pcrs_compile_fuzzed_replacement(const char *replacement, int *errptr)
202 : {
203 0 : int capturecount = PCRS_MAX_SUBMATCHES; /* XXX: fuzzworthy? */
204 0 : int trivial_flag = 0; /* We don't want to fuzz strncpy() */
205 :
206 0 : *errptr = 0; /* XXX: Should pcrs_compile_replacement() do this? */
207 :
208 0 : return pcrs_compile_replacement(replacement, trivial_flag, capturecount, errptr);
209 :
210 : }
211 : #endif
212 :
213 :
214 : /*********************************************************************
215 : *
216 : * Function : pcrs_compile_replacement
217 : *
218 : * Description : This function takes a Perl-style replacement (2nd argument
219 : * to the s/// operator and returns a compiled pcrs_substitute,
220 : * or NULL if memory allocation for the substitute structure
221 : * fails.
222 : *
223 : * Parameters :
224 : * 1 : replacement = replacement part of s/// operator
225 : * in perl syntax
226 : * 2 : trivialflag = Flag that causes backreferences to be
227 : * ignored.
228 : * 3 : capturecount = Number of capturing subpatterns in
229 : * the pattern. Needed for $+ handling.
230 : * 4 : errptr = pointer to an integer in which error
231 : * conditions can be returned.
232 : *
233 : * Returns : pcrs_substitute data structure, or NULL if an
234 : * error is encountered. In that case, *errptr has
235 : * the reason.
236 : *
237 : *********************************************************************/
238 1261126 : static pcrs_substitute *pcrs_compile_replacement(const char *replacement, int trivialflag, int capturecount, int *errptr)
239 : {
240 : int i, k, l, quoted;
241 : char *text;
242 : pcrs_substitute *r;
243 : #ifndef FUZZ
244 : size_t length;
245 : #else
246 : static size_t length;
247 : #endif
248 1261126 : i = k = l = quoted = 0;
249 :
250 : /*
251 : * Sanity check
252 : */
253 1261126 : if (NULL == replacement)
254 : {
255 0 : replacement = "";
256 : }
257 :
258 : /*
259 : * Get memory or fail
260 : */
261 1261126 : if (NULL == (r = (pcrs_substitute *)malloc(sizeof(pcrs_substitute))))
262 : {
263 0 : *errptr = PCRS_ERR_NOMEM;
264 0 : return NULL;
265 : }
266 1261126 : memset(r, '\0', sizeof(pcrs_substitute));
267 :
268 1261126 : length = strlen(replacement);
269 :
270 1261126 : if (NULL == (text = (char *)malloc(length + 1)))
271 : {
272 0 : free(r);
273 0 : *errptr = PCRS_ERR_NOMEM;
274 0 : return NULL;
275 : }
276 1261126 : memset(text, '\0', length + 1);
277 :
278 :
279 : /*
280 : * In trivial mode, just copy the substitute text
281 : */
282 1261126 : if (trivialflag)
283 : {
284 1149328 : strlcpy(text, replacement, length + 1);
285 1149328 : k = (int)length;
286 : }
287 :
288 : /*
289 : * Else, parse, cut out and record all backreferences
290 : */
291 : else
292 : {
293 1764922 : while (i < (int)length)
294 : {
295 : /* Quoting */
296 1653124 : if (replacement[i] == '\\')
297 : {
298 0 : if (quoted)
299 : {
300 0 : text[k++] = replacement[i++];
301 0 : quoted = 0;
302 : }
303 : else
304 : {
305 0 : if (replacement[i+1] && strchr("tnrfae0", replacement[i+1]))
306 : {
307 0 : switch (replacement[++i])
308 : {
309 0 : case 't':
310 0 : text[k++] = '\t';
311 0 : break;
312 0 : case 'n':
313 0 : text[k++] = '\n';
314 0 : break;
315 0 : case 'r':
316 0 : text[k++] = '\r';
317 0 : break;
318 0 : case 'f':
319 0 : text[k++] = '\f';
320 0 : break;
321 0 : case 'a':
322 0 : text[k++] = 7;
323 0 : break;
324 0 : case 'e':
325 0 : text[k++] = 27;
326 0 : break;
327 0 : case '0':
328 0 : text[k++] = '\0';
329 0 : break;
330 : }
331 0 : i++;
332 : }
333 0 : else if (is_hex_sequence(&replacement[i]))
334 : {
335 : /*
336 : * Replace a hex sequence with a single
337 : * character with the sequence's ascii value.
338 : * e.g.: '\x7e' => '~'
339 : */
340 0 : const int ascii_value = xtoi(&replacement[i+2]);
341 :
342 0 : assert(ascii_value >= 0);
343 0 : assert(ascii_value < 256);
344 0 : text[k++] = (char)ascii_value;
345 0 : i += 4;
346 : }
347 : else
348 : {
349 0 : quoted = 1;
350 0 : i++;
351 : }
352 : }
353 0 : continue;
354 : }
355 :
356 : /* Backreferences */
357 1653124 : if (replacement[i] == '$' && !quoted && i < (int)(length - 1))
358 : {
359 195174 : char *symbol, symbols[] = "'`+&";
360 195174 : if (l >= PCRS_MAX_SUBMATCHES)
361 : {
362 0 : freez(text);
363 0 : freez(r);
364 0 : *errptr = PCRS_WARN_BADREF;
365 0 : return NULL;
366 : }
367 195174 : r->block_length[l] = (size_t)(k - r->block_offset[l]);
368 :
369 : /* Numerical backreferences */
370 195174 : if (isdigit((int)replacement[i + 1]))
371 : {
372 390348 : while (i < (int)length && isdigit((int)replacement[++i]))
373 : {
374 195174 : r->backref[l] = r->backref[l] * 10 + replacement[i] - 48;
375 : }
376 195174 : if (r->backref[l] > capturecount)
377 : {
378 0 : freez(text);
379 0 : freez(r);
380 0 : *errptr = PCRS_WARN_BADREF;
381 0 : return NULL;
382 : }
383 : }
384 :
385 : /* Symbolic backreferences: */
386 0 : else if (NULL != (symbol = strchr(symbols, replacement[i + 1])))
387 : {
388 :
389 0 : if (symbol - symbols == 2) /* $+ */
390 : {
391 0 : r->backref[l] = capturecount;
392 : }
393 0 : else if (symbol - symbols == 3) /* $& */
394 : {
395 0 : r->backref[l] = 0;
396 : }
397 : else /* $' or $` */
398 : {
399 0 : r->backref[l] = (int)(PCRS_MAX_SUBMATCHES + 1 - (symbol - symbols));
400 : }
401 0 : i += 2;
402 : }
403 :
404 : /* Invalid backref -> plain '$' */
405 : else
406 : {
407 0 : goto plainchar;
408 : }
409 :
410 195174 : assert(r->backref[l] < PCRS_MAX_SUBMATCHES + 2);
411 : /* Valid and in range? -> record */
412 195174 : if ((0 <= r->backref[l]) &&
413 195174 : (r->backref[l] < PCRS_MAX_SUBMATCHES + 2) &&
414 : (l < PCRS_MAX_SUBMATCHES - 1))
415 : {
416 195174 : r->backref_count[r->backref[l]] += 1;
417 195174 : r->block_offset[++l] = k;
418 : }
419 : else
420 : {
421 0 : freez(text);
422 0 : freez(r);
423 0 : *errptr = PCRS_WARN_BADREF;
424 0 : return NULL;
425 : }
426 195174 : continue;
427 : }
428 :
429 1457950 : plainchar:
430 : /* Plain chars are copied */
431 1457950 : text[k++] = replacement[i++];
432 1457950 : quoted = 0;
433 : }
434 : } /* -END- if (!trivialflag) */
435 :
436 : /*
437 : * Finish & return
438 : */
439 1261126 : r->text = text;
440 1261126 : r->backrefs = l;
441 1261126 : r->length = (size_t)k;
442 1261126 : r->block_length[l] = (size_t)(k - r->block_offset[l]);
443 :
444 1261126 : return r;
445 :
446 : }
447 :
448 :
449 : /*********************************************************************
450 : *
451 : * Function : pcrs_free_job
452 : *
453 : * Description : Frees the memory used by a pcrs_job struct and its
454 : * dependent structures.
455 : *
456 : * Parameters :
457 : * 1 : job = pointer to the pcrs_job structure to be freed
458 : *
459 : * Returns : a pointer to the next job, if there was any, or
460 : * NULL otherwise.
461 : *
462 : *********************************************************************/
463 1149598 : pcrs_job *pcrs_free_job(pcrs_job *job)
464 : {
465 : pcrs_job *next;
466 :
467 1149598 : if (job == NULL)
468 : {
469 0 : return NULL;
470 : }
471 : else
472 : {
473 1149598 : next = job->next;
474 1149598 : if (job->pattern != NULL) free(job->pattern);
475 1149598 : if (job->hints != NULL)
476 : {
477 : #ifdef PCRE_CONFIG_JIT
478 1149598 : pcre_free_study(job->hints);
479 : #else
480 : free(job->hints);
481 : #endif
482 : }
483 1149598 : if (job->substitute != NULL)
484 : {
485 1149598 : if (job->substitute->text != NULL) free(job->substitute->text);
486 1149598 : free(job->substitute);
487 : }
488 1149598 : free(job);
489 : }
490 1149598 : return next;
491 :
492 : }
493 :
494 :
495 : /*********************************************************************
496 : *
497 : * Function : pcrs_free_joblist
498 : *
499 : * Description : Iterates through a chained list of pcrs_job's and
500 : * frees them using pcrs_free_job.
501 : *
502 : * Parameters :
503 : * 1 : joblist = pointer to the first pcrs_job structure to
504 : * be freed
505 : *
506 : * Returns : N/A
507 : *
508 : *********************************************************************/
509 0 : void pcrs_free_joblist(pcrs_job *joblist)
510 : {
511 0 : while (NULL != (joblist = pcrs_free_job(joblist))) {};
512 :
513 0 : return;
514 :
515 : }
516 :
517 :
518 : /*********************************************************************
519 : *
520 : * Function : pcrs_compile_command
521 : *
522 : * Description : Parses a string with a Perl-style s/// command,
523 : * calls pcrs_compile, and returns a corresponding
524 : * pcrs_job, or NULL if parsing or compiling the job
525 : * fails.
526 : *
527 : * Parameters :
528 : * 1 : command = string with perl-style s/// command
529 : * 2 : errptr = pointer to an integer in which error
530 : * conditions can be returned.
531 : *
532 : * Returns : a corresponding pcrs_job data structure, or NULL
533 : * if an error was encountered. In that case, *errptr
534 : * has the reason.
535 : *
536 : *********************************************************************/
537 111798 : pcrs_job *pcrs_compile_command(const char *command, int *errptr)
538 : {
539 111798 : int i, k, l, quoted = FALSE;
540 : size_t limit;
541 : char delimiter;
542 : char *tokens[4];
543 : pcrs_job *newjob;
544 :
545 111798 : k = l = 0;
546 :
547 : /*
548 : * Tokenize the perl command
549 : */
550 111798 : limit = strlen(command);
551 111798 : if (limit < 4)
552 : {
553 0 : *errptr = PCRS_ERR_CMDSYNTAX;
554 0 : return NULL;
555 : }
556 : else
557 : {
558 111798 : delimiter = command[1];
559 : }
560 :
561 111798 : tokens[l] = (char *) malloc(limit + 1);
562 :
563 6938454 : for (i = 0; i <= (int)limit; i++)
564 : {
565 :
566 6826656 : if (command[i] == delimiter && !quoted)
567 : {
568 335394 : if (l == 3)
569 : {
570 0 : l = -1;
571 0 : break;
572 : }
573 335394 : tokens[0][k++] = '\0';
574 335394 : tokens[++l] = tokens[0] + k;
575 335394 : continue;
576 : }
577 :
578 6491262 : else if (command[i] == '\\' && !quoted)
579 : {
580 275722 : quoted = TRUE;
581 275722 : if (command[i+1] == delimiter) continue;
582 : }
583 : else
584 : {
585 6215540 : quoted = FALSE;
586 : }
587 6491262 : tokens[0][k++] = command[i];
588 : }
589 :
590 : /*
591 : * Syntax error ?
592 : */
593 111798 : if (l != 3)
594 : {
595 0 : *errptr = PCRS_ERR_CMDSYNTAX;
596 0 : free(tokens[0]);
597 0 : return NULL;
598 : }
599 :
600 111798 : newjob = pcrs_compile(tokens[1], tokens[2], tokens[3], errptr);
601 111798 : free(tokens[0]);
602 111798 : return newjob;
603 :
604 : }
605 :
606 :
607 : /*********************************************************************
608 : *
609 : * Function : pcrs_compile
610 : *
611 : * Description : Takes the three arguments to a perl s/// command
612 : * and compiles a pcrs_job structure from them.
613 : *
614 : * Parameters :
615 : * 1 : pattern = string with perl-style pattern
616 : * 2 : substitute = string with perl-style substitute
617 : * 3 : options = string with perl-style options
618 : * 4 : errptr = pointer to an integer in which error
619 : * conditions can be returned.
620 : *
621 : * Returns : a corresponding pcrs_job data structure, or NULL
622 : * if an error was encountered. In that case, *errptr
623 : * has the reason.
624 : *
625 : *********************************************************************/
626 1261126 : pcrs_job *pcrs_compile(const char *pattern, const char *substitute, const char *options, int *errptr)
627 : {
628 : pcrs_job *newjob;
629 : int flags;
630 : int capturecount;
631 : const char *error;
632 1261126 : int pcre_study_options = 0;
633 :
634 1261126 : *errptr = 0;
635 :
636 : /*
637 : * Handle NULL arguments
638 : */
639 1261126 : if (pattern == NULL) pattern = "";
640 1261126 : if (substitute == NULL) substitute = "";
641 :
642 :
643 : /*
644 : * Get and init memory
645 : */
646 1261126 : if (NULL == (newjob = (pcrs_job *)malloc(sizeof(pcrs_job))))
647 : {
648 0 : *errptr = PCRS_ERR_NOMEM;
649 0 : return NULL;
650 : }
651 1261126 : memset(newjob, '\0', sizeof(pcrs_job));
652 :
653 :
654 : /*
655 : * Evaluate the options
656 : */
657 1261126 : newjob->options = pcrs_parse_perl_options(options, &flags);
658 1261126 : newjob->flags = flags;
659 :
660 :
661 : /*
662 : * Compile the pattern
663 : */
664 1261126 : error = NULL;
665 1261126 : newjob->pattern = pcre_compile(pattern, newjob->options, &error, errptr, NULL);
666 1261126 : if (newjob->pattern == NULL)
667 : {
668 0 : pcrs_free_job(newjob);
669 0 : return NULL;
670 : }
671 :
672 :
673 : #ifdef PCRE_STUDY_JIT_COMPILE
674 1261126 : if (!(flags & PCRS_DYNAMIC))
675 : {
676 1261126 : pcre_study_options = PCRE_STUDY_JIT_COMPILE;
677 : }
678 : #endif
679 :
680 : /*
681 : * Generate hints. This has little overhead, since the
682 : * hints will be NULL for a boring pattern anyway.
683 : */
684 1261126 : newjob->hints = pcre_study(newjob->pattern, pcre_study_options, &error);
685 1261126 : if (error != NULL)
686 : {
687 0 : *errptr = PCRS_ERR_STUDY;
688 0 : pcrs_free_job(newjob);
689 0 : return NULL;
690 : }
691 :
692 :
693 : /*
694 : * Determine the number of capturing subpatterns.
695 : * This is needed for handling $+ in the substitute.
696 : */
697 1261126 : if (0 > (*errptr = pcre_fullinfo(newjob->pattern, newjob->hints, PCRE_INFO_CAPTURECOUNT, &capturecount)))
698 : {
699 0 : pcrs_free_job(newjob);
700 0 : return NULL;
701 : }
702 :
703 :
704 : /*
705 : * Compile the substitute
706 : */
707 1261126 : if (NULL == (newjob->substitute = pcrs_compile_replacement(substitute, newjob->flags & PCRS_TRIVIAL, capturecount, errptr)))
708 : {
709 0 : pcrs_free_job(newjob);
710 0 : return NULL;
711 : }
712 :
713 1261126 : return newjob;
714 :
715 : }
716 :
717 :
718 : /*********************************************************************
719 : *
720 : * Function : pcrs_execute_list
721 : *
722 : * Description : This is a multiple job wrapper for pcrs_execute().
723 : * Apply the regular substitutions defined by the jobs in
724 : * the joblist to the subject.
725 : * The subject itself is left untouched, memory for the result
726 : * is malloc()ed and it is the caller's responsibility to free
727 : * the result when it's no longer needed.
728 : *
729 : * Note: For convenient string handling, a null byte is
730 : * appended to the result. It does not count towards the
731 : * result_length, though.
732 : *
733 : *
734 : * Parameters :
735 : * 1 : joblist = the chained list of pcrs_jobs to be executed
736 : * 2 : subject = the subject string
737 : * 3 : subject_length = the subject's length
738 : * 4 : result = char** for returning the result
739 : * 5 : result_length = size_t* for returning the result's length
740 : *
741 : * Returns : On success, the number of substitutions that were made.
742 : * May be > 1 if job->flags contained PCRS_GLOBAL
743 : * On failure, the (negative) pcre error code describing the
744 : * failure, which may be translated to text using pcrs_strerror().
745 : *
746 : *********************************************************************/
747 0 : int pcrs_execute_list(pcrs_job *joblist, char *subject, size_t subject_length, char **result, size_t *result_length)
748 : {
749 : pcrs_job *job;
750 0 : char *old, *new = NULL;
751 : int hits, total_hits;
752 :
753 0 : old = subject;
754 0 : *result_length = subject_length;
755 0 : total_hits = 0;
756 :
757 0 : for (job = joblist; job != NULL; job = job->next)
758 : {
759 0 : hits = pcrs_execute(job, old, *result_length, &new, result_length);
760 :
761 0 : if (old != subject) free(old);
762 :
763 0 : if (hits < 0)
764 : {
765 0 : return(hits);
766 : }
767 : else
768 : {
769 0 : total_hits += hits;
770 0 : old = new;
771 : }
772 : }
773 :
774 0 : *result = new;
775 0 : return(total_hits);
776 :
777 : }
778 :
779 :
780 : /*********************************************************************
781 : *
782 : * Function : pcrs_execute
783 : *
784 : * Description : Apply the regular substitution defined by the job to the
785 : * subject.
786 : * The subject itself is left untouched, memory for the result
787 : * is malloc()ed and it is the caller's responsibility to free
788 : * the result when it's no longer needed.
789 : *
790 : * Note: For convenient string handling, a null byte is
791 : * appended to the result. It does not count towards the
792 : * result_length, though.
793 : *
794 : * Parameters :
795 : * 1 : job = the pcrs_job to be executed
796 : * 2 : subject = the subject (== original) string
797 : * 3 : subject_length = the subject's length
798 : * 4 : result = char** for returning the result (NULL on error)
799 : * 5 : result_length = size_t* for returning the result's length
800 : *
801 : * Returns : On success, the number of substitutions that were made.
802 : * May be > 1 if job->flags contained PCRS_GLOBAL
803 : * On failure, the (negative) pcre error code describing the
804 : * failure, which may be translated to text using pcrs_strerror().
805 : *
806 : *********************************************************************/
807 2842638 : int pcrs_execute(pcrs_job *job, const char *subject, size_t subject_length, char **result, size_t *result_length)
808 : {
809 : int offsets[3 * PCRS_MAX_SUBMATCHES],
810 : offset,
811 : i, k,
812 : matches_found,
813 : submatches,
814 2842638 : max_matches = PCRS_MAX_MATCH_INIT;
815 : size_t newsize;
816 : pcrs_match *matches, *dummy;
817 : char *result_offset;
818 :
819 2842638 : offset = i = 0;
820 2842638 : *result = NULL;
821 :
822 : /*
823 : * Sanity check & memory allocation
824 : */
825 2842638 : if (job == NULL || job->pattern == NULL || job->substitute == NULL || NULL == subject)
826 : {
827 0 : return(PCRS_ERR_BADJOB);
828 : }
829 :
830 2842638 : if (NULL == (matches = (pcrs_match *)malloc((size_t)max_matches * sizeof(pcrs_match))))
831 : {
832 0 : return(PCRS_ERR_NOMEM);
833 : }
834 2842638 : memset(matches, '\0', (size_t)max_matches * sizeof(pcrs_match));
835 :
836 :
837 : /*
838 : * Find the pattern and calculate the space
839 : * requirements for the result
840 : */
841 2842638 : newsize = subject_length;
842 :
843 4310692 : while ((submatches = pcre_exec(job->pattern, job->hints, subject, (int)subject_length, offset, 0, offsets, 3 * PCRS_MAX_SUBMATCHES)) > 0)
844 : {
845 1481865 : job->flags |= PCRS_SUCCESS;
846 1481865 : matches[i].submatches = submatches;
847 :
848 2964497 : for (k = 0; k < submatches; k++)
849 : {
850 1482632 : matches[i].submatch_offset[k] = offsets[2 * k];
851 :
852 : /* Note: Non-found optional submatches have length -1-(-1)==0 */
853 1482632 : matches[i].submatch_length[k] = (size_t)(offsets[2 * k + 1] - offsets[2 * k]);
854 :
855 : /* reserve mem for each submatch as often as it is ref'd */
856 1482632 : newsize += matches[i].submatch_length[k] * (size_t)job->substitute->backref_count[k];
857 : }
858 : /* plus replacement text size minus match text size */
859 1481865 : newsize += job->substitute->length - matches[i].submatch_length[0];
860 :
861 : /* chunk before match */
862 1481865 : matches[i].submatch_offset[PCRS_MAX_SUBMATCHES] = 0;
863 1481865 : matches[i].submatch_length[PCRS_MAX_SUBMATCHES] = (size_t)offsets[0];
864 1481865 : newsize += (size_t)offsets[0] * (size_t)job->substitute->backref_count[PCRS_MAX_SUBMATCHES];
865 :
866 : /* chunk after match */
867 1481865 : matches[i].submatch_offset[PCRS_MAX_SUBMATCHES + 1] = offsets[1];
868 1481865 : matches[i].submatch_length[PCRS_MAX_SUBMATCHES + 1] = subject_length - (size_t)offsets[1] - 1;
869 1481865 : newsize += (subject_length - (size_t)offsets[1]) * (size_t)job->substitute->backref_count[PCRS_MAX_SUBMATCHES + 1];
870 :
871 : /* Storage for matches exhausted? -> Extend! */
872 1481865 : if (++i >= max_matches)
873 : {
874 26 : max_matches = (int)(max_matches * PCRS_MAX_MATCH_GROW);
875 26 : if (NULL == (dummy = (pcrs_match *)realloc(matches, (size_t)max_matches * sizeof(pcrs_match))))
876 : {
877 0 : free(matches);
878 0 : return(PCRS_ERR_NOMEM);
879 : }
880 26 : matches = dummy;
881 : }
882 :
883 : /* Non-global search or limit reached? */
884 1481865 : if (!(job->flags & PCRS_GLOBAL)) break;
885 :
886 : /* Don't loop on empty matches */
887 1468054 : if (offsets[1] == offset)
888 0 : if ((size_t)offset < subject_length)
889 0 : offset++;
890 : else
891 0 : break;
892 : /* Go find the next one */
893 : else
894 1468054 : offset = offsets[1];
895 : }
896 : /* Pass pcre error through if (bad) failure */
897 2842638 : if (submatches < PCRE_ERROR_NOMATCH)
898 : {
899 1 : free(matches);
900 1 : return submatches;
901 : }
902 2842637 : matches_found = i;
903 :
904 :
905 : /*
906 : * Get memory for the result (must be freed by caller!)
907 : * and append terminating null byte.
908 : */
909 2842637 : if ((*result = (char *)malloc(newsize + 1)) == NULL)
910 : {
911 0 : free(matches);
912 0 : return PCRS_ERR_NOMEM;
913 : }
914 : else
915 : {
916 2842637 : (*result)[newsize] = '\0';
917 : }
918 :
919 :
920 : /*
921 : * Replace
922 : */
923 2842637 : offset = 0;
924 2842637 : result_offset = *result;
925 :
926 4324502 : for (i = 0; i < matches_found; i++)
927 : {
928 : /* copy the chunk preceding the match */
929 1481865 : memcpy(result_offset, subject + offset, (size_t)(matches[i].submatch_offset[0] - offset));
930 1481865 : result_offset += matches[i].submatch_offset[0] - offset;
931 :
932 : /* For every segment of the substitute.. */
933 2975016 : for (k = 0; k <= job->substitute->backrefs; k++)
934 : {
935 : /* ...copy its text.. */
936 1493151 : memcpy(result_offset, job->substitute->text + job->substitute->block_offset[k], job->substitute->block_length[k]);
937 1493151 : result_offset += job->substitute->block_length[k];
938 :
939 : /* ..plus, if it's not the last chunk, i.e.: There *is* a backref.. */
940 1493151 : if (k != job->substitute->backrefs
941 : /* ..in legal range.. */
942 11286 : && job->substitute->backref[k] < PCRS_MAX_SUBMATCHES + 2
943 : /* ..and referencing a real submatch.. */
944 11286 : && job->substitute->backref[k] < matches[i].submatches
945 : /* ..that is nonempty.. */
946 11286 : && matches[i].submatch_length[job->substitute->backref[k]] > 0)
947 : {
948 : /* ..copy the submatch that is ref'd. */
949 11267 : memcpy(
950 : result_offset,
951 11267 : subject + matches[i].submatch_offset[job->substitute->backref[k]],
952 11267 : matches[i].submatch_length[job->substitute->backref[k]]
953 : );
954 11267 : result_offset += matches[i].submatch_length[job->substitute->backref[k]];
955 : }
956 : }
957 1481865 : offset = matches[i].submatch_offset[0] + (int)matches[i].submatch_length[0];
958 : }
959 :
960 : /* Copy the rest. */
961 2842637 : memcpy(result_offset, subject + offset, subject_length - (size_t)offset);
962 :
963 2842637 : *result_length = newsize;
964 2842637 : free(matches);
965 2842637 : return matches_found;
966 :
967 : }
968 :
969 :
/*
 * Non-zero if x is one of 0-9, a-f or A-F. The cast to unsigned char
 * keeps the <ctype.h> call defined for bytes with the high bit set
 * on platforms where plain char is signed.
 */
#define is_hex_digit(x) (isxdigit((unsigned char)(x)) != 0)

/*********************************************************************
 *
 * Function    :  is_hex_sequence
 *
 * Description :  Checks the first four characters of a string
 *                and decides if they are a valid hex sequence
 *                (like '\x40').
 *
 *                The checks short-circuit from left to right, so
 *                the function never reads past the terminating
 *                null byte of a string shorter than four characters.
 *
 * Parameters  :
 *          1  :  sequence = The string to check
 *
 * Returns     :  Non-zero if it's valid sequence, or
 *                Zero if it isn't.
 *
 *********************************************************************/
static int is_hex_sequence(const char *sequence)
{
   return (sequence[0] == '\\' &&
           sequence[1] == 'x'  &&
           is_hex_digit(sequence[2]) &&
           is_hex_digit(sequence[3]));
}
994 :
995 :
996 : /*
997 : * Functions below this line are only part of the pcrs version
998 : * included in Privoxy. If you use any of them you should not
999 : * try to dynamically link against external pcrs versions.
1000 : */
1001 :
1002 : /*********************************************************************
1003 : *
1004 : * Function : pcrs_job_is_dynamic
1005 : *
1006 : * Description : Checks if a job has the "D" (dynamic) option set.
1007 : *
1008 : * Parameters :
1009 : * 1 : job = The job to check
1010 : *
1011 : * Returns : TRUE if the job is indeed dynamic, otherwise
1012 : * FALSE
1013 : *
1014 : *********************************************************************/
1015 111528 : int pcrs_job_is_dynamic(char *job)
1016 : {
1017 111528 : const char delimiter = job[1];
1018 111528 : const size_t length = strlen(job);
1019 : char *option;
1020 :
1021 111528 : if (length < 5)
1022 : {
1023 : /*
1024 : * The shortest valid (but useless)
1025 : * dynamic pattern is "s@@@D"
1026 : */
1027 0 : return FALSE;
1028 : }
1029 :
1030 : /*
1031 : * Everything between the last character
1032 : * and the last delimiter is an option ...
1033 : */
1034 535954 : for (option = job + length; *option != delimiter; option--)
1035 : {
1036 424426 : if (*option == 'D')
1037 : {
1038 : /*
1039 : * ... and if said option is 'D' the job is dynamic.
1040 : */
1041 0 : return TRUE;
1042 : }
1043 : }
1044 111528 : return FALSE;
1045 :
1046 : }
1047 :
1048 :
/*********************************************************************
 *
 * Function    :  pcrs_get_delimiter
 *
 * Description :  Picks a character that can safely serve as pcrs
 *                delimiter for the given string, i.e. one that
 *                does not occur anywhere in it. Candidates are
 *                tried in a fixed order and the first match wins.
 *
 * Parameters  :
 *          1  :  string = The string to search in
 *
 * Returns     :  A safe delimiter if one was found, otherwise '\0'.
 *
 *********************************************************************/
char pcrs_get_delimiter(const char *string)
{
   /*
    * Candidate characters that rarely show up
    * in pcrs replacement strings.
    */
   static const char candidates[] = "><#+*~%^-:;!@";
   size_t i;

   for (i = 0; candidates[i] != '\0'; i++)
   {
      if (NULL == strchr(string, candidates[i]))
      {
         /* Not part of the string, so it's safe to use. */
         return candidates[i];
      }
   }

   /* Every candidate is already used in the string. */
   return '\0';

}
1079 :
1080 :
1081 : /*********************************************************************
1082 : *
1083 : * Function : pcrs_execute_single_command
1084 : *
1085 : * Description : Apply single pcrs command to the subject.
1086 : * The subject itself is left untouched, memory for the result
1087 : * is malloc()ed and it is the caller's responsibility to free
1088 : * the result when it's no longer needed.
1089 : *
1090 : * Parameters :
1091 : * 1 : subject = the subject (== original) string
1092 : * 2 : pcrs_command = the pcrs command as string (s@foo@bar@)
1093 : * 3 : hits = int* for returning the number of modifications
1094 : *
1095 : * Returns : NULL in case of errors, otherwise the
1096 : * result of the pcrs command.
1097 : *
1098 : *********************************************************************/
1099 270 : char *pcrs_execute_single_command(const char *subject, const char *pcrs_command, int *hits)
1100 : {
1101 : size_t size;
1102 270 : char *result = NULL;
1103 : pcrs_job *job;
1104 :
1105 270 : assert(subject);
1106 270 : assert(pcrs_command);
1107 :
1108 270 : *hits = 0;
1109 270 : size = strlen(subject);
1110 :
1111 270 : job = pcrs_compile_command(pcrs_command, hits);
1112 270 : if (NULL != job)
1113 : {
1114 270 : *hits = pcrs_execute(job, subject, size, &result, &size);
1115 270 : if (*hits < 0)
1116 : {
1117 0 : freez(result);
1118 : }
1119 270 : pcrs_free_job(job);
1120 : }
1121 270 : return result;
1122 :
1123 : }
1124 :
1125 :
1126 : /*********************************************************************
1127 : *
1128 : * Function : pcrs_compile_dynamic_command
1129 : *
1130 : * Description : Takes a dynamic pcrs command, fills in the
1131 : * values of the variables and compiles it.
1132 : *
1133 : * Parameters :
1134 : * 1 : pcrs_command = The dynamic pcrs command to compile
1135 : * 2 : v = NULL terminated array of variables and their values.
1136 : * 3 : error = pcrs error code
1137 : *
1138 : * Returns : NULL in case of hard errors, otherwise the
1139 : * compiled pcrs job.
1140 : *
1141 : *********************************************************************/
1142 0 : pcrs_job *pcrs_compile_dynamic_command(char *pcrs_command, const struct pcrs_variable v[], int *error)
1143 : {
1144 : char buf[PCRS_BUFFER_SIZE];
1145 0 : const char *original_pcrs_command = pcrs_command;
1146 0 : char *pcrs_command_tmp = NULL;
1147 0 : pcrs_job *job = NULL;
1148 0 : int truncation = 0;
1149 : char d;
1150 : int ret;
1151 :
1152 0 : while ((NULL != v->name) && (NULL != pcrs_command))
1153 : {
1154 0 : assert(NULL != v->value);
1155 :
1156 0 : if (NULL == strstr(pcrs_command, v->name))
1157 : {
1158 : /*
1159 : * Skip the substitution if the variable
1160 : * name isn't part of the pattern.
1161 : */
1162 0 : v++;
1163 0 : continue;
1164 : }
1165 :
1166 : /* Use pcrs to replace the variable with its value. */
1167 0 : d = pcrs_get_delimiter(v->value);
1168 0 : if ('\0' == d)
1169 : {
1170 : /* No proper delimiter found */
1171 0 : *error = PCRS_ERR_CMDSYNTAX;
1172 0 : freez(pcrs_command_tmp);
1173 0 : return NULL;
1174 : }
1175 :
1176 : /*
1177 : * Variable names are supposed to contain alpha
1178 : * numerical characters plus '_' only.
1179 : */
1180 0 : assert(NULL == strchr(v->name, d));
1181 :
1182 0 : ret = snprintf(buf, sizeof(buf), "s%c\\$%s%c%s%cDgT", d, v->name, d, v->value, d);
1183 0 : assert(ret >= 0);
1184 0 : if (ret >= sizeof(buf))
1185 : {
1186 : /*
1187 : * Value didn't completely fit into buffer,
1188 : * overwrite the end of the substitution text
1189 : * with a truncation message and close the pattern
1190 : * properly.
1191 : */
1192 : static const char warning[] = "... [too long, truncated]";
1193 0 : const size_t trailer_size = sizeof(warning) + 4; /* 4 for d + "DgT" */
1194 0 : char *trailer_start = buf + sizeof(buf) - trailer_size;
1195 :
1196 0 : ret = snprintf(trailer_start, trailer_size, "%s%cDgT", warning, d);
1197 0 : assert(ret == trailer_size - 1);
1198 0 : assert(sizeof(buf) == strlen(buf) + 1);
1199 0 : truncation = 1;
1200 : }
1201 :
1202 0 : pcrs_command_tmp = pcrs_execute_single_command(pcrs_command, buf, error);
1203 0 : if (NULL == pcrs_command_tmp)
1204 : {
1205 0 : return NULL;
1206 : }
1207 :
1208 0 : if (pcrs_command != original_pcrs_command)
1209 : {
1210 0 : freez(pcrs_command);
1211 : }
1212 0 : pcrs_command = pcrs_command_tmp;
1213 :
1214 0 : v++;
1215 : }
1216 :
1217 0 : job = pcrs_compile_command(pcrs_command, error);
1218 0 : if (pcrs_command != original_pcrs_command)
1219 : {
1220 0 : freez(pcrs_command);
1221 : }
1222 :
1223 0 : if (truncation)
1224 : {
1225 0 : *error = PCRS_WARN_TRUNCATION;
1226 : }
1227 :
1228 0 : return job;
1229 :
1230 : }
1231 :
1232 :
1233 : /*
1234 : Local Variables:
1235 : tab-width: 3
1236 : end:
1237 : */
|