/* RCS identification string for this revision of pcrs.c. */
1 const char pcrs_rcs[] = "$Id: pcrs.c,v 1.1 2001/05/13 21:57:07 administrator Exp $";
3 /*********************************************************************
5 * File : $Source: /home/administrator/cvs/ijb/pcrs.c,v $
7 * Purpose : This is the pre-pre-alpha release of libpcrs. It is only
8 * published at this (ugly) stage of development, because it is
9 * needed for a new feature in JunkBuster.
11 * Apart from the code being quite a mess, no inconsistencies,
12 * memory leaks or functional bugs **should** be present.
14 * While you ROTFL at the code, you could just as well mail me
15 * (andreas@oesterhelt.org) with advice for improvement.
17 * pcrs is a supplement to the brilliant pcre library by Philip
18 * Hazel (ph10@cam.ac.uk) and adds Perl-style substitution. That
19 * is, it mimics Perl's 's' operator.
21 * Currently, there's no documentation besides comments and the
24 * Copyright : Written and copyright by andreas@oesterhelt.org
29 *********************************************************************/
/* Version string taken from the PCRS_H_VERSION macro, which is not defined
 * in the lines shown here (presumably it comes from pcrs.h -- TODO confirm). */
35 const char pcrs_h_rcs[] = PCRS_H_VERSION;
38 /*********************************************************************
40 * Function : my_strsep
42 * Description : Convenience function. It acts like strsep, except that
43 * it respects quoting of the delimiter character with the
44 * quote character. (And, of course, quoting the quote char
45 * with itself.) Called from `pcrs_make_job'.
48 * 1 : token = current token
 *     (output buffer; the visible code writes into it without any
 *     size check, so the caller presumably sizes it -- confirm)
49 * 2 : text = string to tokenize
50 * 3 : delimiter = single character delimiter
51 * 4 : quote_char = character to cause quoting
53 * Returns : -1 => failure, else the length of the token found.
54 * In the latter case, *text is the token's start.
56 *********************************************************************/
57 int my_strsep(char *token, char **text, char delimiter, char quote_char)
/* i indexes the input, k is the write index into token, quoted is a flag
 * consulted by the two conditions below. */
59 int i, k=0, limit, quoted = FALSE;
/* Length of the string that *text currently points to. */
61 limit = strlen(*text);
69 for (i=0; i < limit; i++)
/* Case 1: a delimiter that is not quoted. The statements executed in this
 * case are not visible in this excerpt. */
71 if (text[0][i] == delimiter && !quoted)
/* Case 2: an unquoted quote character directly followed by the delimiter.
 * The i+1 < limit test guards the one-character look-ahead.
 * NOTE(review): only "quote followed by delimiter" is tested here; the
 * "quote char quoted with itself" case from the header is not evident in
 * the visible lines -- confirm. */
76 else if (text[0][i] == quote_char && !quoted && i+1 < limit && text[0][i+1] == delimiter)
/* Copy the current input character into the token buffer. */
81 token[k++] = text[0][i];
91 /*********************************************************************
93 * Function : pcrs_compile_perl_options
95 * Description : This function parses a string containing the options to
96 * Perl's s/// operator. It returns an integer that is the
97 * pcre equivalent of the symbolic optstring.
98 * Since pcre doesn't know about Perl's 'g' (global) option,
99 * but pcrs needs it, the globalflag integer is set if 'g'
103 * 1 : optstring = string with options in perl syntax
104 * 2 : globalflag = see description
106 * Returns : option integer suitable for pcre
108 *********************************************************************/
109 int pcrs_compile_perl_options(char *optstring, int *globalflag)
/* Visit every option character. strlen() sits in the loop condition, so it
 * is re-evaluated on each pass. */
113 for (i=0; i < strlen(optstring); i++)
/* Translate each Perl option letter into the matching pcre flag bit, which
 * is OR-ed into rc. 'g' has no pcre flag and is reported through
 * *globalflag instead.
 * NOTE(review): the visible lines only ever set *globalflag to 1; whether
 * it is cleared first is not visible in this excerpt -- confirm. */
118 case 'g': *globalflag = 1; break;
119 case 'i': rc |= PCRE_CASELESS; break;
120 case 'm': rc |= PCRE_MULTILINE; break;
122 case 's': rc |= PCRE_DOTALL; break;
123 case 'x': rc |= PCRE_EXTENDED; break;
132 /*********************************************************************
134 * Function : pcrs_compile_replacement
136 * Description : This function takes a Perl-style replacement (2nd argument
137 * to the s/// operator) and returns a compiled pcrs_substitute,
138 * or NULL if memory allocation for the substitute structure
142 * 1 : replacement = replacement part of s/// operator
144 * 2 : errptr = pointer to an integer in which error
145 * conditions can be returned.
147 * Returns : pcrs_substitute data structure, or NULL if an
148 * error is encountered. In that case, *errptr has
151 *********************************************************************/
152 pcrs_substitute *pcrs_compile_replacement(char *replacement, int *errptr)
/* i indexes the input, k is the write index into text, and l indexes the
 * per-block arrays (block_offset, block_length, backref) of the result. */
154 int length, i, k = 0, l = 0, quoted = 0, idx;
155 char *text, *num_ptr, *numbers = "0123456789";
/* Allocate the result structure and zero every byte of it, so all of its
 * fields start out as 0.
 * NOTE(review): the NULL return on the next line happens without a visible
 * assignment to *errptr, unlike what the header comment suggests. */
158 r = (pcrs_substitute *)malloc(sizeof(pcrs_substitute));
159 if (r == NULL) return NULL;
160 memset(r, '\0', sizeof(pcrs_substitute));
/* Working copy of the replacement; it is filled in below via text[k++]. */
162 text = strdup(replacement); /* must be free()d by caller */
/* Out-of-memory is reported through errptr. The guarding condition is not
 * visible in this excerpt (presumably the strdup() failure path). */
165 *errptr = PCRS_ERR_NOMEM;
170 length = strlen(replacement);
172 for (i=0; i < length; i++)
174 /* Backslash treatment */
175 if (replacement[i] == '\\')
/* A character is copied to the output here; the surrounding quoting logic
 * is not visible in this excerpt. */
179 text[k++] = replacement[i];
189 /* Dollar treatment */
/* Only an unquoted '$' that is not the last character is considered. */
190 if (replacement[i] == '$' && !quoted && i < length - 1)
/* A '$' that is not followed by a digit or '&' appears to be copied as a
 * literal character. */
192 if (strchr("0123456789&", replacement[i + 1]) == NULL)
194 text[k++] = replacement[i];
/* Close the current literal block: its length is what has been written
 * since its start offset. */
198 r->block_length[l] = k - r->block_offset[l];
200 if (replacement[i + 1] != '&')
/* Accumulate the decimal digits following '$' into backref[l]. i is
 * pre-incremented, so the character is read before the i < length test.
 * At the end of the string that character is the terminating NUL, which
 * strchr() also finds in numbers, so the i < length test is what ends the
 * loop there. */
202 while ((num_ptr = strchr(numbers, replacement[++i])) != NULL && i < length)
204 idx = num_ptr - numbers;
205 r->backref[l] = r->backref[l] * 10 + idx;
/* Count how often each submatch is referenced, but only for indices below
 * PCRS_MAX_SUBMATCHES.
 * NOTE(review): no bound on l itself is visible in this excerpt -- confirm
 * the per-block arrays cannot be overrun by many backreferences. */
211 if (r->backref[l] < PCRS_MAX_SUBMATCHES)
212 r->backref_count[r->backref[l]] += 1;
/* A block starts at the current write position (l is presumably advanced
 * on a line not visible here). */
214 r->block_offset[l] = k;
219 /* Plain char treatment */
220 text[k++] = replacement[i];
/* Length of the final literal block. */
227 r->block_length[l] = k - r->block_offset[l];
233 /*********************************************************************
235 * Function : pcrs_free_job
237 * Description : Frees the memory used by a pcrs_job struct and its
238 * dependent structures. Returns a pointer to the next
239 * job, if there was any, or NULL otherwise.
242 * 1 : job = pointer to the pcrs_job structure to be freed
244 * Returns : a pointer to the next job, if there was any, or
247 *********************************************************************/
248 pcrs_job *pcrs_free_job(pcrs_job *job)
/* Release the compiled pattern and the study data. The NULL checks are
 * harmless but not required, since free(NULL) is a no-op.
 * NOTE(review): in pcrs_make_job these two pointers come from
 * pcre_compile()/pcre_study(); pcre documents pcre_free for releasing
 * them -- confirm that plain free() is equivalent in this build. */
259 if (job->pattern != NULL) free(job->pattern);
260 if (job->hints != NULL) free(job->hints);
/* The substitute owns its text buffer, so free the text before the
 * structure that points to it. */
261 if (job->substitute != NULL)
263 if (job->substitute->text != NULL) free(job->substitute->text);
264 free(job->substitute);
273 /*********************************************************************
275 * Function : pcrs_make_job
277 * Description : Main entry point. Takes a string with a Perl-style
278 * s/// command and returns a corresponding pcrs_job,
279 * or NULL if compiling the job fails at any stage.
280 * Diagnostics could obviously be improved.
283 * 1 : command = string with perl-style s/// command
284 * 2 : errptr = pointer to an integer in which error
285 * conditions can be returned.
287 * Returns : a corresponding pcrs_job data structure, or NULL
288 * if an error was encountered. In that case, *errptr
291 *********************************************************************/
292 pcrs_job *pcrs_make_job(char *command, int *errptr)
294 char *dummy, *token, delimiter;
296 int i = 0, globalflag;
299 /* Get and init memory */
300 if ((newjob = (pcrs_job *)malloc(sizeof(pcrs_job))) == NULL)
302 *errptr = PCRS_ERR_NOMEM;
/* Zero the job so that every pointer member starts as NULL; the error
 * paths below hand the partially built job to pcrs_free_job(). */
305 memset(newjob, '\0', sizeof(pcrs_job));
307 /* Command too short? */
308 if (strlen(command) < 4)
310 *errptr = PCRS_ERR_CMDSYNTAX;
311 pcrs_free_job(newjob);
315 /* Split command into tokens and handle them */
/* The second character of the command is the delimiter, e.g. the '/' in
 * "s/a/b/".
 * NOTE(review): no NULL checks for the two malloc() calls are visible in
 * this excerpt, nor is any release of token/dummy -- confirm. */
316 delimiter = command[1];
317 token = (char *)malloc(strlen(command)); /* current token */
318 dummy = (char *)malloc(strlen(command)); /* must store pattern, since we can't */
319 /* use it until the options are known */
/* Tokenise with backslash as the quote character; a negative return from
 * my_strsep ends the loop. */
320 while (my_strsep(token, &command, delimiter, '\\') >= 0)
324 /* We don't care about the command and assume 's' */
/* Keep the pattern text for later; it is compiled after the loop. */
330 strcpy(dummy, token);
335 newjob->substitute = pcrs_compile_replacement(token, errptr);
336 if (newjob->substitute == NULL)
338 pcrs_free_job(newjob);
/* The options token yields both the pcre option bits and the global flag. */
345 newjob->options = pcrs_compile_perl_options(token, &globalflag);
346 newjob->globalflag = globalflag;
349 /* There shouldn't be anything else! */
351 *errptr = PCRS_ERR_CMDSYNTAX;
352 pcrs_free_job(newjob);
359 /* Compile the pattern */
/* NOTE(review): errptr is passed in the position that pcre_compile() uses
 * for the error offset, so on a compile failure *errptr would hold an
 * offset into the pattern rather than a PCRS_ERR_* code -- confirm this
 * is intended. */
360 newjob->pattern = pcre_compile(dummy, newjob->options, &error, errptr, NULL);
361 if (newjob->pattern == NULL)
363 pcrs_free_job(newjob);
369 * Generate hints. This has little overhead, since the
370 * hints will be NULL for a boring pattern anyway.
372 newjob->hints = pcre_study(newjob->pattern, 0, &error);
375 *errptr = PCRS_ERR_STUDY;
376 pcrs_free_job(newjob);
385 /*********************************************************************
387 * Function : create_pcrs_job
389 * Description : Create a job from all its components, if you don't
390 * have a Perl command to start from. Rather boring.
393 * 1 : pattern = pointer to pcre pattern
394 * 2 : hints = pointer to pcre hints
395 * 3 : options = options in pcre format
396 * 4 : globalflag = flag that indicates if global matching is desired
397 * 5 : substitute = pointer to pcrs_substitute data structure
398 * 6 : errptr = pointer to an integer in which error
399 * conditions can be returned.
401 * Returns : pcrs_job structure, or NULL if an error was encountered.
402 * In that case, *errptr has the reason why.
404 *********************************************************************/
405 pcrs_job *create_pcrs_job(pcre *pattern, pcre_extra *hints, int options, int globalflag, pcrs_substitute *substitute, int *errptr)
409 if ((newjob = (pcrs_job *)malloc(sizeof(pcrs_job))) == NULL)
411 *errptr = PCRS_ERR_NOMEM;
/* Zero the whole job first, so members not assigned below stay 0/NULL. */
414 memset(newjob, '\0', sizeof(pcrs_job));
/* The pointers are stored as given, not copied. pcrs_free_job() frees
 * pattern, hints and substitute, so a job built here takes them over. */
416 newjob->pattern = pattern;
417 newjob->hints = hints;
418 newjob->options = options;
419 newjob->globalflag = globalflag;
420 newjob->substitute = substitute;
427 /*********************************************************************
429 * Function : pcrs_exec_substitution
431 * Description : Modify the subject by executing the regular substitution
432 * defined by the job. Since the result may be longer than
433 * the subject, its space requirements are precalculated in
434 * the matching phase and new memory is allocated accordingly.
435 * It is the caller's responsibility to free the result when
436 * it's no longer needed.
439 * 1 : job = the pcrs_job to be executed
440 * 2 : subject = the subject (== original) string
441 * 3 : subject_length = the subject's length
442 * 4 : result = char** for returning the result
443 * 5 : result_length = int* for returning the result's length
445 * Returns : the number of substitutions that were made. May be > 1
446 * if job->globalflag was set
 *     A pcre_exec() result below -1 is returned unchanged, and
 *     PCRS_ERR_NOMEM is returned if the result buffer cannot be
 *     allocated.
448 *********************************************************************/
449 int pcrs_exec_substitution(pcrs_job *job, char *subject, int subject_length, char **result, int *result_length)
451 int offsets[3 * PCRS_MAX_SUBMATCHES],
452 offset = 0, i=0, k, matches_found, newsize, submatches;
453 pcrs_match matches[PCRS_MAX_MATCHES];
/* Phase 1 (matching): start from the subject length and adjust it per
 * match to obtain the size of the result. */
456 newsize=subject_length;
/* NOTE(review): the offsets-vector size is passed as the literal 99, while
 * offsets[] is declared with 3 * PCRS_MAX_SUBMATCHES elements; the two
 * agree only if PCRS_MAX_SUBMATCHES is 33 -- confirm. */
459 while ((submatches = pcre_exec(job->pattern, job->hints, subject, subject_length, offset, 0, offsets, 99)) > 0)
/* Record the offset and length of the whole match (k == 0) and of each
 * submatch for match number i. */
461 matches[i].submatches = submatches;
462 for (k=0; k < submatches; k++)
464 matches[i].submatch_offset[k] = offsets[2 * k];
465 matches[i].submatch_length[k] = offsets[2 * k + 1] - offsets[2 * k]; /* Non-found optional submatches have length -1-(-1)==0 */
466 newsize += matches[i].submatch_length[k] * job->substitute->backref_count[k]; /* reserve mem for each submatch as often as it is ref'd */
468 newsize += strlen(job->substitute->text) - matches[i].submatch_length[0]; /* plus replacement text size minus match text size */
470 /* Non-global search or limit reached? */
471 if (++i >= PCRS_MAX_MATCHES || !job->globalflag ) break;
473 /* Don't loop on empty matches */
/* offsets[1] is the end of the whole match; if it equals the offset the
 * search started from, the match was empty at that position. */
474 if (offsets[1] == offset)
475 if (offset < subject_length)
479 /* Go find the next one */
/* -1 is pcre's "no match" result, so only values below it are errors. */
483 if (submatches < -1) return submatches; /* Pass pcre error through */
/* Phase 2 (assembly): allocate exactly newsize bytes.
 * NOTE(review): no extra byte for a terminating NUL is visible here, so
 * the result should be treated as a counted buffer -- confirm. */
487 if ((*result = (char *)malloc(newsize)) == NULL) /* must be free()d by caller */
489 return PCRS_ERR_NOMEM;
/* result_offset is the write cursor into the result buffer. */
494 result_offset = *result;
496 for (i=0; i < matches_found; i++)
498 memcpy(result_offset, subject + offset, matches[i].submatch_offset[0] - offset); /* copy the chunk preceding the match */
499 result_offset += matches[i].submatch_offset[0] - offset;
501 /* For every segment of the substitute.. */
502 for (k=0; k <= job->substitute->backrefs; k++)
504 /* ...copy its text.. */
505 memcpy(result_offset, job->substitute->text + job->substitute->block_offset[k], job->substitute->block_length[k]);
506 result_offset += job->substitute->block_length[k];
508 /* ..plus, if it's not the last chunk (i.e.: There IS a backref).. */
/* NOTE(review): in the condition below backref[k] is used as an array
 * index before its range test is evaluated, and that test uses <=, whereas
 * pcrs_compile_replacement tests with < PCRS_MAX_SUBMATCHES -- confirm
 * that this cannot read out of bounds. */
509 if (k != job->substitute->backrefs
510 /* ..and a nonempty match.. */
511 && matches[i].submatch_length[job->substitute->backref[k]] > 0
512 /* ..and in legal range, ... */
513 && job->substitute->backref[k] <= PCRS_MAX_SUBMATCHES)
515 /* copy the submatch that is ref'd. */
518 subject + matches[i].submatch_offset[job->substitute->backref[k]],
519 matches[i].submatch_length[job->substitute->backref[k]]
521 result_offset += matches[i].submatch_length[job->substitute->backref[k]];
/* Continue reading the subject right after the end of this match. */
524 offset = matches[i].submatch_offset[0] + matches[i].submatch_length[0];
/* Copy whatever follows the last match. */
528 memcpy(result_offset, subject + offset, subject_length - offset);
530 *result_length = newsize;
531 return matches_found;