This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 3.0 patch #4 Patch #2 continued
[perl5.git] / regexec.c
CommitLineData
a687059c
LW
1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
ae986130 10/* $Header: regexec.c,v 3.0.1.1 89/11/11 04:52:04 lwall Locked $
a687059c
LW
11 *
12 * $Log: regexec.c,v $
ae986130
LW
13 * Revision 3.0.1.1 89/11/11 04:52:04 lwall
14 * patch2: /\b$foo/ didn't work
15 *
a687059c
LW
16 * Revision 3.0 89/10/18 15:22:53 lwall
17 * 3.0 baseline
18 *
19 */
20
21/*
22 * regcomp and regexec -- regsub and regerror are not used in perl
23 *
24 * Copyright (c) 1986 by University of Toronto.
25 * Written by Henry Spencer. Not derived from licensed software.
26 *
27 * Permission is granted to anyone to use this software for any
28 * purpose on any computer system, and to redistribute it freely,
29 * subject to the following restrictions:
30 *
31 * 1. The author is not responsible for the consequences of use of
32 * this software, no matter how awful, even if they arise
33 * from defects in it.
34 *
35 * 2. The origin of this software must not be misrepresented, either
36 * by explicit claim or by omission.
37 *
38 * 3. Altered versions must be plainly marked as such, and must not
39 * be misrepresented as being the original software.
40 *
41 **** Alterations to Henry's code are...
42 ****
43 **** Copyright (c) 1989, Larry Wall
44 ****
45 **** You may distribute under the terms of the GNU General Public License
46 **** as specified in the README file that comes with the perl 3.0 kit.
47 *
48 * Beware that some of this code is subtly aware of the way operator
49 * precedence is structured in regular expressions. Serious changes in
50 * regular-expression syntax might require a total rethink.
51 */
52#include "EXTERN.h"
53#include "perl.h"
54#include "regcomp.h"
55
56#ifndef STATIC
57#define STATIC static
58#endif
59
60#ifdef DEBUGGING
61int regnarrate = 0;
62#endif
63
64/*
65 * regexec and friends
66 */
67
68/*
69 * Global work variables for regexec().
70 */
71static char *regprecomp;
72static char *reginput; /* String-input pointer. */
73static char *regbol; /* Beginning of input, for ^ check. */
74static char *regeol; /* End of input, for $ check. */
75static char **regstartp; /* Pointer to startp array. */
76static char **regendp; /* Ditto for endp. */
77static char *reglastparen; /* Similarly for lastparen. */
78static char *regtill;
79
80static char *regmystartp[10]; /* For remembering backreferences. */
81static char *regmyendp[10];
82
83/*
84 * Forwards.
85 */
86STATIC int regtry();
87STATIC int regmatch();
88STATIC int regrepeat();
89
90extern int multiline;
91
92/*
93 - regexec - match a regexp against a string
94 */
95int
96regexec(prog, stringarg, strend, strbeg, minend, screamer, safebase)
97register regexp *prog;
98char *stringarg;
99register char *strend; /* pointer to null at end of string */
100char *strbeg; /* real beginning of string */
101int minend; /* end of match must be at least minend after stringarg */
102STR *screamer;
103int safebase; /* no need to remember string in subbase */
104{
105 register char *s;
106 register int i;
107 register char *c;
108 register char *string = stringarg;
109 register int tmp;
110 int minlen = 0; /* must match at least this many chars */
111 int dontbother = 0; /* how many characters not to try at end */
112 int beginning = (string == strbeg); /* is ^ valid at stringarg? */
113
114 /* Be paranoid... */
115 if (prog == NULL || string == NULL) {
116 fatal("NULL regexp parameter");
117 return(0);
118 }
119
120 regprecomp = prog->precomp;
121 /* Check validity of program. */
122 if (UCHARAT(prog->program) != MAGIC) {
123 FAIL("corrupted regexp program");
124 }
125
126 if (prog->do_folding) {
127 safebase = FALSE;
128 i = strend - string;
129 New(1101,c,i+1,char);
130 (void)bcopy(string, c, i+1);
131 string = c;
132 strend = string + i;
133 for (s = string; s < strend; s++)
134 if (isupper(*s))
135 *s = tolower(*s);
136 }
137
138 /* If there is a "must appear" string, look for it. */
139 s = string;
140 if (prog->regmust != Nullstr) {
141 if (beginning && screamer) {
142 if (screamfirst[prog->regmust->str_rare] >= 0)
143 s = screaminstr(screamer,prog->regmust);
144 else
145 s = Nullch;
146 }
147#ifndef lint
148 else
149 s = fbminstr((unsigned char*)s, (unsigned char*)strend,
150 prog->regmust);
151#endif
152 if (!s) {
153 ++prog->regmust->str_u.str_useful; /* hooray */
154 goto phooey; /* not present */
155 }
156 else if (prog->regback >= 0) {
157 s -= prog->regback;
158 if (s < string)
159 s = string;
160 minlen = prog->regback + prog->regmust->str_cur;
161 }
162 else if (--prog->regmust->str_u.str_useful < 0) { /* boo */
163 str_free(prog->regmust);
164 prog->regmust = Nullstr; /* disable regmust */
165 s = string;
166 }
167 else {
168 s = string;
169 minlen = prog->regmust->str_cur;
170 }
171 }
172
173 /* Mark beginning of line for ^ . */
174 if (beginning)
175 regbol = string;
176 else
177 regbol = NULL;
178
179 /* Mark end of line for $ (and such) */
180 regeol = strend;
181
182 /* see how far we have to get to not match where we matched before */
183 regtill = string+minend;
184
185 /* Simplest case: anchored match need be tried only once. */
186 /* [unless multiline is set] */
187 if (prog->reganch) {
188 if (regtry(prog, string))
189 goto got_it;
190 else if (multiline) {
191 if (minlen)
192 dontbother = minlen - 1;
193 strend -= dontbother;
194 /* for multiline we only have to try after newlines */
195 if (s > string)
196 s--;
197 for (; s < strend; s++) {
198 if (*s == '\n') {
199 if (++s < strend && regtry(prog, s))
200 goto got_it;
201 }
202 }
203 }
204 goto phooey;
205 }
206
207 /* Messy cases: unanchored match. */
208 if (prog->regstart) {
209 /* We know what string it must start with. */
210 if (prog->regstart->str_pok == 3) {
211#ifndef lint
212 while ((s = fbminstr((unsigned char*)s,
213 (unsigned char*)strend, prog->regstart)) != NULL)
214#else
215 while (s = Nullch)
216#endif
217 {
218 if (regtry(prog, s))
219 goto got_it;
220 s++;
221 }
222 }
223 else {
224 c = prog->regstart->str_ptr;
225 while ((s = ninstr(s, strend,
226 c, c + prog->regstart->str_cur )) != NULL) {
227 if (regtry(prog, s))
228 goto got_it;
229 s++;
230 }
231 }
232 goto phooey;
233 }
234 if (c = prog->regstclass) {
235 if (minlen)
236 dontbother = minlen - 1;
237 strend -= dontbother; /* don't bother with what can't match */
238 /* We know what class it must start with. */
239 switch (OP(c)) {
240 case ANYOF: case ANYBUT:
241 c = OPERAND(c);
242 while (s < strend) {
243 i = *s;
244 if (!(c[i >> 3] & (1 << (i&7))))
245 if (regtry(prog, s))
246 goto got_it;
247 s++;
248 }
249 break;
250 case BOUND:
251 if (minlen)
252 dontbother++,strend--;
253 if (s != string) {
254 i = s[-1];
255 tmp = (isalpha(i) || isdigit(i) || i == '_');
256 }
257 else
258 tmp = 0; /* assume not alphanumeric */
259 while (s < strend) {
260 i = *s;
261 if (tmp != (isalpha(i) || isdigit(i) || i == '_')) {
262 tmp = !tmp;
263 if (regtry(prog, s))
264 goto got_it;
265 }
266 s++;
267 }
ae986130 268 if ((minlen || tmp) && regtry(prog,s))
a687059c
LW
269 goto got_it;
270 break;
271 case NBOUND:
272 if (minlen)
273 dontbother++,strend--;
274 if (s != string) {
275 i = s[-1];
276 tmp = (isalpha(i) || isdigit(i) || i == '_');
277 }
278 else
279 tmp = 0; /* assume not alphanumeric */
280 while (s < strend) {
281 i = *s;
282 if (tmp != (isalpha(i) || isdigit(i) || i == '_'))
283 tmp = !tmp;
284 else if (regtry(prog, s))
285 goto got_it;
286 s++;
287 }
ae986130 288 if ((minlen || !tmp) && regtry(prog,s))
a687059c
LW
289 goto got_it;
290 break;
291 case ALNUM:
292 while (s < strend) {
293 i = *s;
294 if (isalpha(i) || isdigit(i) || i == '_')
295 if (regtry(prog, s))
296 goto got_it;
297 s++;
298 }
299 break;
300 case NALNUM:
301 while (s < strend) {
302 i = *s;
303 if (!isalpha(i) && !isdigit(i) && i != '_')
304 if (regtry(prog, s))
305 goto got_it;
306 s++;
307 }
308 break;
309 case SPACE:
310 while (s < strend) {
311 if (isspace(*s))
312 if (regtry(prog, s))
313 goto got_it;
314 s++;
315 }
316 break;
317 case NSPACE:
318 while (s < strend) {
319 if (!isspace(*s))
320 if (regtry(prog, s))
321 goto got_it;
322 s++;
323 }
324 break;
325 case DIGIT:
326 while (s < strend) {
327 if (isdigit(*s))
328 if (regtry(prog, s))
329 goto got_it;
330 s++;
331 }
332 break;
333 case NDIGIT:
334 while (s < strend) {
335 if (!isdigit(*s))
336 if (regtry(prog, s))
337 goto got_it;
338 s++;
339 }
340 break;
341 }
342 }
343 else {
344 dontbother = minend;
345 strend -= dontbother;
346 /* We don't know much -- general case. */
347 do {
348 if (regtry(prog, s))
349 goto got_it;
350 } while (s++ < strend);
351 }
352
353 /* Failure. */
354 goto phooey;
355
356 got_it:
357 if ((!safebase && (prog->nparens || sawampersand)) || prog->do_folding){
358 strend += dontbother; /* uncheat */
359 if (safebase) /* no need for $digit later */
360 s = strbeg;
361 else if (strbeg != prog->subbase) {
362 i = strend - string + (stringarg - strbeg);
363 s = nsavestr(strbeg,i); /* so $digit will work later */
364 if (prog->subbase)
365 Safefree(prog->subbase);
366 prog->subbase = s;
367 }
368 else
369 s = prog->subbase;
370 s += (stringarg - strbeg);
371 for (i = 0; i <= prog->nparens; i++) {
372 if (prog->endp[i]) {
373 prog->startp[i] = s + (prog->startp[i] - string);
374 prog->endp[i] = s + (prog->endp[i] - string);
375 }
376 }
377 if (prog->do_folding)
378 Safefree(string);
379 }
380 return(1);
381
382 phooey:
383 if (prog->do_folding)
384 Safefree(string);
385 return(0);
386}
387
388/*
389 - regtry - try match at specific point
390 */
391static int /* 0 failure, 1 success */
392regtry(prog, string)
393regexp *prog;
394char *string;
395{
396 register int i;
397 register char **sp;
398 register char **ep;
399
400 reginput = string;
401 regstartp = prog->startp;
402 regendp = prog->endp;
403 reglastparen = &prog->lastparen;
404 prog->lastparen = 0;
405
406 sp = prog->startp;
407 ep = prog->endp;
408 if (prog->nparens) {
409 for (i = NSUBEXP; i > 0; i--) {
410 *sp++ = NULL;
411 *ep++ = NULL;
412 }
413 }
414 if (regmatch(prog->program + 1) && reginput >= regtill) {
415 prog->startp[0] = string;
416 prog->endp[0] = reginput;
417 return(1);
418 } else
419 return(0);
420}
421
422/*
423 - regmatch - main matching routine
424 *
425 * Conceptually the strategy is simple: check to see whether the current
426 * node matches, call self recursively to see whether the rest matches,
427 * and then act accordingly. In practice we make some effort to avoid
428 * recursion, in particular by going through "ordinary" nodes (that don't
429 * need to know whether the rest of the match failed) by a loop instead of
430 * by recursion.
431 */
432/* [lwall] I've hoisted the register declarations to the outer block in order to
433 * maybe save a little bit of pushing and popping on the stack. It also takes
434 * advantage of machines that use a register save mask on subroutine entry.
435 */
436static int /* 0 failure, 1 success */
437regmatch(prog)
438char *prog;
439{
440 register char *scan; /* Current node. */
441 char *next; /* Next node. */
442 register int nextchar;
443 register int n; /* no or next */
444 register int ln; /* len or last */
445 register char *s; /* operand or save */
446 register char *locinput = reginput;
447
448 nextchar = *locinput;
449 scan = prog;
450#ifdef DEBUGGING
451 if (scan != NULL && regnarrate)
452 fprintf(stderr, "%s(\n", regprop(scan));
453#endif
454 while (scan != NULL) {
455#ifdef DEBUGGING
456 if (regnarrate)
457 fprintf(stderr, "%s...\n", regprop(scan));
458#endif
459
460#ifdef REGALIGN
461 next = scan + NEXT(scan);
462 if (next == scan)
463 next = NULL;
464#else
465 next = regnext(scan);
466#endif
467
468 switch (OP(scan)) {
469 case BOL:
470 if (locinput == regbol ||
471 ((nextchar || locinput < regeol) &&
472 locinput[-1] == '\n') )
473 {
474 regtill--;
475 break;
476 }
477 return(0);
478 case EOL:
479 if ((nextchar || locinput < regeol) && nextchar != '\n')
480 return(0);
481 regtill--;
482 break;
483 case ANY:
484 if ((nextchar == '\0' && locinput >= regeol) ||
485 nextchar == '\n')
486 return(0);
487 nextchar = *++locinput;
488 break;
489 case EXACTLY:
490 s = OPERAND(scan);
491 ln = *s++;
492 /* Inline the first character, for speed. */
493 if (*s != nextchar)
494 return(0);
495 if (locinput + ln > regeol)
496 return 0;
497 if (ln > 1 && bcmp(s, locinput, ln) != 0)
498 return(0);
499 locinput += ln;
500 nextchar = *locinput;
501 break;
502 case ANYOF:
503 case ANYBUT:
504 s = OPERAND(scan);
505 if (nextchar < 0)
506 nextchar = UCHARAT(locinput);
507 if (s[nextchar >> 3] & (1 << (nextchar&7)))
508 return(0);
509 nextchar = *++locinput;
510 if (!nextchar && locinput > regeol)
511 return 0;
512 break;
513 case ALNUM:
514 if (!nextchar)
515 return(0);
516 if (!isalpha(nextchar) && !isdigit(nextchar) &&
517 nextchar != '_')
518 return(0);
519 nextchar = *++locinput;
520 break;
521 case NALNUM:
522 if (!nextchar && locinput >= regeol)
523 return(0);
524 if (isalpha(nextchar) || isdigit(nextchar) ||
525 nextchar == '_')
526 return(0);
527 nextchar = *++locinput;
528 break;
529 case NBOUND:
530 case BOUND:
531 if (locinput == regbol) /* was last char in word? */
532 ln = 0;
533 else
534 ln = (isalpha(locinput[-1]) ||
535 isdigit(locinput[-1]) ||
536 locinput[-1] == '_' );
537 n = (isalpha(nextchar) || isdigit(nextchar) ||
538 nextchar == '_' ); /* is next char in word? */
539 if ((ln == n) == (OP(scan) == BOUND))
540 return(0);
541 break;
542 case SPACE:
543 if (!nextchar && locinput >= regeol)
544 return(0);
545 if (!isspace(nextchar))
546 return(0);
547 nextchar = *++locinput;
548 break;
549 case NSPACE:
550 if (!nextchar)
551 return(0);
552 if (isspace(nextchar))
553 return(0);
554 nextchar = *++locinput;
555 break;
556 case DIGIT:
557 if (!isdigit(nextchar))
558 return(0);
559 nextchar = *++locinput;
560 break;
561 case NDIGIT:
562 if (!nextchar && locinput >= regeol)
563 return(0);
564 if (isdigit(nextchar))
565 return(0);
566 nextchar = *++locinput;
567 break;
568 case REF:
569 case REF+1:
570 case REF+2:
571 case REF+3:
572 case REF+4:
573 case REF+5:
574 case REF+6:
575 case REF+7:
576 case REF+8:
577 case REF+9:
578 n = OP(scan) - REF;
579 s = regmystartp[n];
580 if (!s)
581 return(0);
582 if (!regmyendp[n])
583 return(0);
584 if (s == regmyendp[n])
585 break;
586 /* Inline the first character, for speed. */
587 if (*s != nextchar)
588 return(0);
589 ln = regmyendp[n] - s;
590 if (locinput + ln > regeol)
591 return 0;
592 if (ln > 1 && bcmp(s, locinput, ln) != 0)
593 return(0);
594 locinput += ln;
595 nextchar = *locinput;
596 break;
597
598 case NOTHING:
599 break;
600 case BACK:
601 break;
602 case OPEN+1:
603 case OPEN+2:
604 case OPEN+3:
605 case OPEN+4:
606 case OPEN+5:
607 case OPEN+6:
608 case OPEN+7:
609 case OPEN+8:
610 case OPEN+9:
611 n = OP(scan) - OPEN;
612 reginput = locinput;
613
614 regmystartp[n] = locinput; /* for REF */
615 if (regmatch(next)) {
616 /*
617 * Don't set startp if some later
618 * invocation of the same parentheses
619 * already has.
620 */
621 if (regstartp[n] == NULL)
622 regstartp[n] = locinput;
623 return(1);
624 } else
625 return(0);
626 /* NOTREACHED */
627 case CLOSE+1:
628 case CLOSE+2:
629 case CLOSE+3:
630 case CLOSE+4:
631 case CLOSE+5:
632 case CLOSE+6:
633 case CLOSE+7:
634 case CLOSE+8:
635 case CLOSE+9: {
636 n = OP(scan) - CLOSE;
637 reginput = locinput;
638
639 regmyendp[n] = locinput; /* for REF */
640 if (regmatch(next)) {
641 /*
642 * Don't set endp if some later
643 * invocation of the same parentheses
644 * already has.
645 */
646 if (regendp[n] == NULL) {
647 regendp[n] = locinput;
648 if (n > *reglastparen)
649 *reglastparen = n;
650 }
651 return(1);
652 } else
653 return(0);
654 }
655 /*NOTREACHED*/
656 case BRANCH: {
657 if (OP(next) != BRANCH) /* No choice. */
658 next = NEXTOPER(scan); /* Avoid recursion. */
659 else {
660 do {
661 reginput = locinput;
662 if (regmatch(NEXTOPER(scan)))
663 return(1);
664#ifdef REGALIGN
665 if (n = NEXT(scan))
666 scan += n;
667 else
668 scan = NULL;
669#else
670 scan = regnext(scan);
671#endif
672 } while (scan != NULL && OP(scan) == BRANCH);
673 return(0);
674 /* NOTREACHED */
675 }
676 }
677 break;
678 case STAR:
679 case PLUS:
680 /*
681 * Lookahead to avoid useless match attempts
682 * when we know what character comes next.
683 */
684 if (OP(next) == EXACTLY)
685 nextchar = *(OPERAND(next)+1);
686 else
687 nextchar = -1000;
688 ln = (OP(scan) == STAR) ? 0 : 1;
689 reginput = locinput;
690 n = regrepeat(NEXTOPER(scan));
691 while (n >= ln) {
692 /* If it could work, try it. */
693 if (nextchar == -1000 || *reginput == nextchar)
694 if (regmatch(next))
695 return(1);
696 /* Couldn't or didn't -- back up. */
697 n--;
698 reginput = locinput + n;
699 }
700 return(0);
701 case END:
702 reginput = locinput; /* put where regtry can find it */
703 return(1); /* Success! */
704 default:
705 printf("%x %d\n",scan,scan[1]);
706 FAIL("regexp memory corruption");
707 }
708
709 scan = next;
710 }
711
712 /*
713 * We get here only if there's trouble -- normally "case END" is
714 * the terminating point.
715 */
716 FAIL("corrupted regexp pointers");
717 /*NOTREACHED*/
718#ifdef lint
719 return 0;
720#endif
721}
722
723/*
724 - regrepeat - repeatedly match something simple, report how many
725 */
726/*
727 * [This routine now assumes that it will only match on things of length 1.
728 * That was true before, but now we assume scan - reginput is the count,
729 * rather than incrementing count on every character.]
730 */
731static int
732regrepeat(p)
733char *p;
734{
735 register char *scan;
736 register char *opnd;
737 register int c;
738 register char *loceol = regeol;
739
740 scan = reginput;
741 opnd = OPERAND(p);
742 switch (OP(p)) {
743 case ANY:
744 while (scan < loceol && *scan != '\n')
745 scan++;
746 break;
747 case EXACTLY: /* length of string is 1 */
748 opnd++;
749 while (scan < loceol && *opnd == *scan)
750 scan++;
751 break;
752 case ANYOF:
753 case ANYBUT:
754 c = UCHARAT(scan);
755 while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) {
756 scan++;
757 c = UCHARAT(scan);
758 }
759 break;
760 case ALNUM:
761 while (isalpha(*scan) || isdigit(*scan) || *scan == '_')
762 scan++;
763 break;
764 case NALNUM:
765 while (scan < loceol && (!isalpha(*scan) && !isdigit(*scan) &&
766 *scan != '_'))
767 scan++;
768 break;
769 case SPACE:
770 while (scan < loceol && isspace(*scan))
771 scan++;
772 break;
773 case NSPACE:
774 while (scan < loceol && !isspace(*scan))
775 scan++;
776 break;
777 case DIGIT:
778 while (isdigit(*scan))
779 scan++;
780 break;
781 case NDIGIT:
782 while (scan < loceol && !isdigit(*scan))
783 scan++;
784 break;
785 default: /* Oh dear. Called inappropriately. */
786 FAIL("internal regexp foulup");
787 /* NOTREACHED */
788 }
789
790 c = scan - reginput;
791 reginput = scan;
792
793 return(c);
794}
795
796/*
797 - regnext - dig the "next" pointer out of a node
798 *
799 * [Note, when REGALIGN is defined there are two places in regmatch()
800 * that bypass this code for speed.]
801 */
802char *
803regnext(p)
804register char *p;
805{
806 register int offset;
807
808 if (p == &regdummy)
809 return(NULL);
810
811 offset = NEXT(p);
812 if (offset == 0)
813 return(NULL);
814
815#ifdef REGALIGN
816 return(p+offset);
817#else
818 if (OP(p) == BACK)
819 return(p-offset);
820 else
821 return(p+offset);
822#endif
823}