This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
This is my patch patch.1n for perl5.001.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
f0fcb552 22/*SUPPRESS 112*/
a687059c 23/*
e50aee73 24 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
25 *
26 * Copyright (c) 1986 by University of Toronto.
27 * Written by Henry Spencer. Not derived from licensed software.
28 *
29 * Permission is granted to anyone to use this software for any
30 * purpose on any computer system, and to redistribute it freely,
31 * subject to the following restrictions:
32 *
33 * 1. The author is not responsible for the consequences of use of
34 * this software, no matter how awful, even if they arise
35 * from defects in it.
36 *
37 * 2. The origin of this software must not be misrepresented, either
38 * by explicit claim or by omission.
39 *
40 * 3. Altered versions must be plainly marked as such, and must not
41 * be misrepresented as being the original software.
42 *
43 **** Alterations to Henry's code are...
44 ****
a0d0e21e 45 **** Copyright (c) 1991-1994, Larry Wall
a687059c 46 ****
9ef589d8
LW
47 **** You may distribute under the terms of either the GNU General Public
48 **** License or the Artistic License, as specified in the README file.
a687059c
LW
49 *
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
53 */
54#include "EXTERN.h"
55#include "perl.h"
56#include "regcomp.h"
57
58#ifndef STATIC
59#define STATIC static
60#endif
61
62#ifdef DEBUGGING
a0d0e21e
LW
63static I32 regnarrate = 0;
64static char* regprogram = 0;
a687059c
LW
65#endif
66
a0d0e21e
LW
67/* Current curly descriptor */
68typedef struct curcur CURCUR;
69struct curcur {
70 int parenfloor; /* how far back to strip paren data */
71 int cur; /* how many instances of scan we've matched */
72 int min; /* the minimal number of scans to match */
73 int max; /* the maximal number of scans to match */
74 int minmod; /* whether to work our way up or down */
75 char * scan; /* the thing to match */
76 char * next; /* what has to match after it */
77 char * lastloc; /* where we started matching this scan */
78 CURCUR * oldcc; /* current curly before we started this one */
79};
80
81static CURCUR* regcc;
82
83typedef I32 CHECKPOINT;
84
85CHECKPOINT regcppush _((I32 parenfloor));
86char * regcppop _((void));
87
88CHECKPOINT
89regcppush(parenfloor)
90I32 parenfloor;
91{
92 int retval = savestack_ix;
93 int i = (regsize - parenfloor) * 3;
94 int p;
95
96 SSCHECK(i + 5);
97 for (p = regsize; p > parenfloor; p--) {
98 SSPUSHPTR(regendp[p]);
99 SSPUSHPTR(regstartp[p]);
100 SSPUSHINT(p);
101 }
102 SSPUSHINT(regsize);
103 SSPUSHINT(*reglastparen);
104 SSPUSHPTR(reginput);
105 SSPUSHINT(i + 3);
106 SSPUSHINT(SAVEt_REGCONTEXT);
107 return retval;
108}
109
110char*
111regcppop()
112{
113 I32 i = SSPOPINT;
114 U32 paren = 0;
115 char *input;
116 char *tmps;
117 assert(i == SAVEt_REGCONTEXT);
118 i = SSPOPINT;
119 input = (char *) SSPOPPTR;
120 *reglastparen = SSPOPINT;
121 regsize = SSPOPINT;
122 for (i -= 3; i > 0; i -= 3) {
123 paren = (U32)SSPOPINT;
124 regstartp[paren] = (char *) SSPOPPTR;
125 tmps = (char*)SSPOPPTR;
126 if (paren <= *reglastparen)
127 regendp[paren] = tmps;
128 }
129 for (paren = *reglastparen + 1; paren <= regnpar; paren++) {
130 if (paren > regsize)
131 regstartp[paren] = Nullch;
132 regendp[paren] = Nullch;
133 }
134 return input;
135}
136
137#define regcpblow(cp) leave_scope(cp)
138
a687059c 139/*
e50aee73 140 * pregexec and friends
a687059c
LW
141 */
142
143/*
a687059c
LW
144 * Forwards.
145 */
a0d0e21e
LW
146
147static I32 regmatch _((char *prog));
148static I32 regrepeat _((char *p, I32 max));
149static I32 regtry _((regexp *prog, char *startpos));
a687059c
LW
150
151/*
e50aee73 152 - pregexec - match a regexp against a string
a687059c 153 */
79072805 154I32
e50aee73 155pregexec(prog, stringarg, strend, strbeg, minend, screamer, safebase)
a687059c
LW
156register regexp *prog;
157char *stringarg;
158register char *strend; /* pointer to null at end of string */
159char *strbeg; /* real beginning of string */
79072805
LW
160I32 minend; /* end of match must be at least minend after stringarg */
161SV *screamer;
162I32 safebase; /* no need to remember string in subbase */
a687059c 163{
a0d0e21e
LW
164 register char *s;
165 register I32 i;
166 register char *c;
167 register char *startpos = stringarg;
168 register I32 tmp;
169 I32 minlen = 0; /* must match at least this many chars */
170 I32 dontbother = 0; /* how many characters not to try at end */
171 CURCUR cc;
a687059c 172
a0d0e21e
LW
173 cc.cur = 0;
174 regcc = &cc;
175
176#ifdef DEBUGGING
177 regnarrate = debug & 512;
178 regprogram = prog->program;
179#endif
180
181 /* Be paranoid... */
182 if (prog == NULL || startpos == NULL) {
183 croak("NULL regexp parameter");
184 return 0;
185 }
186
187 if (startpos == strbeg) /* is ^ valid at stringarg? */
188 regprev = '\n';
189 else {
190 regprev = stringarg[-1];
191 if (!multiline && regprev == '\n')
192 regprev = '\0'; /* force ^ to NOT match */
193 }
194 regprecomp = prog->precomp;
195 regnpar = prog->nparens;
196 /* Check validity of program. */
197 if (UCHARAT(prog->program) != MAGIC) {
198 FAIL("corrupted regexp program");
199 }
200
201 if (prog->do_folding) {
202 i = strend - startpos;
203 New(1101,c,i+1,char);
204 Copy(startpos, c, i+1, char);
205 startpos = c;
206 strend = startpos + i;
207 for (s = startpos; s < strend; s++)
208 if (isUPPER(*s))
209 *s = toLOWER(*s);
210 }
211
212 /* If there is a "must appear" string, look for it. */
213 s = startpos;
214 if (prog->regmust != Nullsv &&
215 (!(prog->reganch & ROPT_ANCH)
216 || (multiline && prog->regback >= 0)) )
217 {
218 if (stringarg == strbeg && screamer) {
219 if (screamfirst[BmRARE(prog->regmust)] >= 0)
220 s = screaminstr(screamer,prog->regmust);
221 else
222 s = Nullch;
0a12ae7d 223 }
a0d0e21e
LW
224 else
225 s = fbm_instr((unsigned char*)s, (unsigned char*)strend,
226 prog->regmust);
227 if (!s) {
228 ++BmUSEFUL(prog->regmust); /* hooray */
229 goto phooey; /* not present */
a687059c 230 }
a0d0e21e
LW
231 else if (prog->regback >= 0) {
232 s -= prog->regback;
233 if (s < startpos)
234 s = startpos;
235 minlen = prog->regback + SvCUR(prog->regmust);
a687059c 236 }
a0d0e21e
LW
237 else if (!prog->naughty && --BmUSEFUL(prog->regmust) < 0) { /* boo */
238 SvREFCNT_dec(prog->regmust);
239 prog->regmust = Nullsv; /* disable regmust */
240 s = startpos;
241 }
242 else {
243 s = startpos;
244 minlen = SvCUR(prog->regmust);
a687059c 245 }
a0d0e21e 246 }
a687059c 247
a0d0e21e
LW
248 /* Mark beginning of line for ^ . */
249 regbol = startpos;
a687059c 250
a0d0e21e
LW
251 /* Mark end of line for $ (and such) */
252 regeol = strend;
a687059c 253
a0d0e21e
LW
254 /* see how far we have to get to not match where we matched before */
255 regtill = startpos+minend;
a687059c 256
a0d0e21e
LW
257 /* Simplest case: anchored match need be tried only once. */
258 /* [unless multiline is set] */
259 if (prog->reganch & ROPT_ANCH) {
260 if (regtry(prog, startpos))
261 goto got_it;
262 else if (multiline || (prog->reganch & ROPT_IMPLICIT)) {
263 if (minlen)
264 dontbother = minlen - 1;
265 strend -= dontbother;
266 /* for multiline we only have to try after newlines */
267 if (s > startpos)
268 s--;
269 while (s < strend) {
270 if (*s++ == '\n') {
271 if (s < strend && regtry(prog, s))
272 goto got_it;
273 }
35c8bce7 274 }
35c8bce7 275 }
a0d0e21e
LW
276 goto phooey;
277 }
35c8bce7 278
a0d0e21e
LW
279 /* Messy cases: unanchored match. */
280 if (prog->regstart) {
281 if (prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */
282 /* it must be a one character string */
283 i = SvPVX(prog->regstart)[0];
284 while (s < strend) {
285 if (*s == i) {
286 if (regtry(prog, s))
a687059c 287 goto got_it;
a0d0e21e
LW
288 s++;
289 while (s < strend && *s == i)
290 s++;
a687059c 291 }
a0d0e21e
LW
292 s++;
293 }
a687059c 294 }
a0d0e21e
LW
295 else if (SvPOK(prog->regstart) == 3) {
296 /* We know what string it must start with. */
297 while ((s = fbm_instr((unsigned char*)s,
298 (unsigned char*)strend, prog->regstart)) != NULL)
299 {
300 if (regtry(prog, s))
301 goto got_it;
302 s++;
303 }
304 }
305 else {
306 c = SvPVX(prog->regstart);
307 while ((s = ninstr(s, strend, c, c + SvCUR(prog->regstart))) != NULL)
308 {
309 if (regtry(prog, s))
310 goto got_it;
311 s++;
312 }
313 }
314 goto phooey;
315 }
316 /*SUPPRESS 560*/
317 if (c = prog->regstclass) {
318 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
a687059c 319
a0d0e21e
LW
320 if (minlen)
321 dontbother = minlen - 1;
322 strend -= dontbother; /* don't bother with what can't match */
323 tmp = 1;
324 /* We know what class it must start with. */
325 switch (OP(c)) {
326 case ANYOF:
327 c = OPERAND(c);
328 while (s < strend) {
329 i = UCHARAT(s);
330 if (!(c[i >> 3] & (1 << (i&7)))) {
331 if (tmp && regtry(prog, s))
332 goto got_it;
333 else
334 tmp = doevery;
a687059c 335 }
a0d0e21e
LW
336 else
337 tmp = 1;
338 s++;
339 }
340 break;
341 case BOUND:
342 if (minlen)
343 dontbother++,strend--;
344 if (s != startpos) {
345 i = s[-1];
346 tmp = isALNUM(i);
347 }
348 else
349 tmp = isALNUM(regprev); /* assume not alphanumeric */
350 while (s < strend) {
351 i = *s;
352 if (tmp != isALNUM(i)) {
353 tmp = !tmp;
354 if (regtry(prog, s))
355 goto got_it;
a687059c 356 }
a0d0e21e
LW
357 s++;
358 }
359 if ((minlen || tmp) && regtry(prog,s))
360 goto got_it;
361 break;
362 case NBOUND:
363 if (minlen)
364 dontbother++,strend--;
365 if (s != startpos) {
366 i = s[-1];
367 tmp = isALNUM(i);
368 }
369 else
370 tmp = isALNUM(regprev); /* assume not alphanumeric */
371 while (s < strend) {
372 i = *s;
373 if (tmp != isALNUM(i))
374 tmp = !tmp;
375 else if (regtry(prog, s))
376 goto got_it;
377 s++;
378 }
379 if ((minlen || !tmp) && regtry(prog,s))
380 goto got_it;
381 break;
382 case ALNUM:
383 while (s < strend) {
384 i = *s;
385 if (isALNUM(i)) {
386 if (tmp && regtry(prog, s))
387 goto got_it;
a687059c 388 else
a0d0e21e
LW
389 tmp = doevery;
390 }
391 else
392 tmp = 1;
393 s++;
394 }
395 break;
396 case NALNUM:
397 while (s < strend) {
398 i = *s;
399 if (!isALNUM(i)) {
400 if (tmp && regtry(prog, s))
401 goto got_it;
a687059c 402 else
a0d0e21e 403 tmp = doevery;
a687059c 404 }
a0d0e21e
LW
405 else
406 tmp = 1;
407 s++;
408 }
409 break;
410 case SPACE:
411 while (s < strend) {
412 if (isSPACE(*s)) {
413 if (tmp && regtry(prog, s))
414 goto got_it;
415 else
416 tmp = doevery;
2304df62 417 }
a0d0e21e
LW
418 else
419 tmp = 1;
420 s++;
421 }
422 break;
423 case NSPACE:
424 while (s < strend) {
425 if (!isSPACE(*s)) {
426 if (tmp && regtry(prog, s))
427 goto got_it;
428 else
429 tmp = doevery;
a687059c 430 }
a0d0e21e
LW
431 else
432 tmp = 1;
433 s++;
434 }
435 break;
436 case DIGIT:
437 while (s < strend) {
438 if (isDIGIT(*s)) {
439 if (tmp && regtry(prog, s))
440 goto got_it;
441 else
442 tmp = doevery;
2b69d0c2 443 }
a0d0e21e
LW
444 else
445 tmp = 1;
446 s++;
447 }
448 break;
449 case NDIGIT:
450 while (s < strend) {
451 if (!isDIGIT(*s)) {
452 if (tmp && regtry(prog, s))
453 goto got_it;
454 else
455 tmp = doevery;
a687059c 456 }
a0d0e21e
LW
457 else
458 tmp = 1;
459 s++;
460 }
461 break;
a687059c 462 }
a0d0e21e
LW
463 }
464 else {
465 if (minlen)
466 dontbother = minlen - 1;
467 strend -= dontbother;
468 /* We don't know much -- general case. */
469 do {
470 if (regtry(prog, s))
471 goto got_it;
472 } while (s++ < strend);
473 }
474
475 /* Failure. */
476 goto phooey;
a687059c 477
a0d0e21e 478got_it:
420218e7 479 strend += dontbother; /* uncheat */
a0d0e21e
LW
480 prog->subbeg = strbeg;
481 prog->subend = strend;
482 if ((!safebase && (prog->nparens || sawampersand)) || prog->do_folding) {
a0d0e21e
LW
483 i = strend - startpos + (stringarg - strbeg);
484 if (safebase) { /* no need for $digit later */
485 s = strbeg;
486 prog->subend = s+i;
487 }
488 else if (strbeg != prog->subbase) {
489 s = savepvn(strbeg,i); /* so $digit will work later */
490 if (prog->subbase)
491 Safefree(prog->subbase);
492 prog->subbeg = prog->subbase = s;
493 prog->subend = s+i;
494 }
495 else {
496 prog->subbeg = s = prog->subbase;
497 prog->subend = s+i;
498 }
499 s += (stringarg - strbeg);
500 for (i = 0; i <= prog->nparens; i++) {
501 if (prog->endp[i]) {
502 prog->startp[i] = s + (prog->startp[i] - startpos);
503 prog->endp[i] = s + (prog->endp[i] - startpos);
504 }
505 }
a687059c 506 if (prog->do_folding)
a0d0e21e
LW
507 Safefree(startpos);
508 }
509 return 1;
510
511phooey:
512 if (prog->do_folding)
513 Safefree(startpos);
514 return 0;
a687059c
LW
515}
516
517/*
518 - regtry - try match at specific point
519 */
79072805 520static I32 /* 0 failure, 1 success */
a0d0e21e 521regtry(prog, startpos)
a687059c 522regexp *prog;
a0d0e21e 523char *startpos;
a687059c 524{
a0d0e21e
LW
525 register I32 i;
526 register char **sp;
527 register char **ep;
528
529 reginput = startpos;
530 regstartp = prog->startp;
531 regendp = prog->endp;
532 reglastparen = &prog->lastparen;
533 prog->lastparen = 0;
534 regsize = 0;
535
536 sp = prog->startp;
537 ep = prog->endp;
538 if (prog->nparens) {
539 for (i = prog->nparens; i >= 0; i--) {
540 *sp++ = NULL;
541 *ep++ = NULL;
a687059c 542 }
a0d0e21e
LW
543 }
544 if (regmatch(prog->program + 1) && reginput >= regtill) {
545 prog->startp[0] = startpos;
546 prog->endp[0] = reginput;
547 return 1;
548 }
549 else
550 return 0;
a687059c
LW
551}
552
553/*
554 - regmatch - main matching routine
555 *
556 * Conceptually the strategy is simple: check to see whether the current
557 * node matches, call self recursively to see whether the rest matches,
558 * and then act accordingly. In practice we make some effort to avoid
559 * recursion, in particular by going through "ordinary" nodes (that don't
560 * need to know whether the rest of the match failed) by a loop instead of
561 * by recursion.
562 */
563/* [lwall] I've hoisted the register declarations to the outer block in order to
564 * maybe save a little bit of pushing and popping on the stack. It also takes
565 * advantage of machines that use a register save mask on subroutine entry.
566 */
79072805 567static I32 /* 0 failure, 1 success */
a687059c
LW
568regmatch(prog)
569char *prog;
570{
a0d0e21e
LW
571 register char *scan; /* Current node. */
572 char *next; /* Next node. */
573 register I32 nextchar;
574 register I32 n; /* no or next */
575 register I32 ln; /* len or last */
576 register char *s; /* operand or save */
577 register char *locinput = reginput;
578 int minmod = 0;
579
580 nextchar = *locinput;
581 scan = prog;
582 while (scan != NULL) {
a687059c 583#ifdef DEBUGGING
a0d0e21e
LW
584 if (regnarrate)
585 fprintf(stderr, "%2d%-8.8s\t<%.10s>\n",
586 scan - regprogram, regprop(scan), locinput);
a687059c
LW
587#endif
588
589#ifdef REGALIGN
a0d0e21e
LW
590 next = scan + NEXT(scan);
591 if (next == scan)
592 next = NULL;
a687059c 593#else
a0d0e21e 594 next = regnext(scan);
a687059c
LW
595#endif
596
a0d0e21e
LW
597 switch (OP(scan)) {
598 case BOL:
599 if (locinput == regbol
600 ? regprev == '\n'
601 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
602 {
603 /* regtill = regbol; */
604 break;
605 }
606 return 0;
607 case MBOL:
608 if (locinput == regbol
609 ? regprev == '\n'
610 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
611 {
612 break;
613 }
614 return 0;
615 case SBOL:
616 if (locinput == regbol && regprev == '\n')
617 break;
618 return 0;
619 case GBOL:
620 if (locinput == regbol)
621 break;
622 return 0;
623 case EOL:
624 if (multiline)
625 goto meol;
626 else
627 goto seol;
628 case MEOL:
629 meol:
630 if ((nextchar || locinput < regeol) && nextchar != '\n')
631 return 0;
632 break;
633 case SEOL:
634 seol:
635 if ((nextchar || locinput < regeol) && nextchar != '\n')
636 return 0;
637 if (regeol - locinput > 1)
638 return 0;
639 break;
640 case SANY:
641 if (!nextchar && locinput >= regeol)
642 return 0;
643 nextchar = *++locinput;
644 break;
645 case ANY:
646 if (!nextchar && locinput >= regeol || nextchar == '\n')
647 return 0;
648 nextchar = *++locinput;
649 break;
650 case EXACTLY:
651 s = OPERAND(scan);
652 ln = *s++;
653 /* Inline the first character, for speed. */
654 if (*s != nextchar)
655 return 0;
656 if (regeol - locinput < ln)
657 return 0;
658 if (ln > 1 && bcmp(s, locinput, ln) != 0)
659 return 0;
660 locinput += ln;
661 nextchar = *locinput;
662 break;
663 case ANYOF:
664 s = OPERAND(scan);
665 if (nextchar < 0)
666 nextchar = UCHARAT(locinput);
667 if (s[nextchar >> 3] & (1 << (nextchar&7)))
668 return 0;
669 if (!nextchar && locinput >= regeol)
670 return 0;
671 nextchar = *++locinput;
672 break;
673 case ALNUM:
674 if (!nextchar)
675 return 0;
676 if (!isALNUM(nextchar))
677 return 0;
678 nextchar = *++locinput;
679 break;
680 case NALNUM:
681 if (!nextchar && locinput >= regeol)
682 return 0;
683 if (isALNUM(nextchar))
684 return 0;
685 nextchar = *++locinput;
686 break;
687 case NBOUND:
688 case BOUND:
689 if (locinput == regbol) /* was last char in word? */
690 ln = isALNUM(regprev);
691 else
692 ln = isALNUM(locinput[-1]);
693 n = isALNUM(nextchar); /* is next char in word? */
694 if ((ln == n) == (OP(scan) == BOUND))
695 return 0;
696 break;
697 case SPACE:
698 if (!nextchar && locinput >= regeol)
699 return 0;
700 if (!isSPACE(nextchar))
701 return 0;
702 nextchar = *++locinput;
703 break;
704 case NSPACE:
705 if (!nextchar)
706 return 0;
707 if (isSPACE(nextchar))
708 return 0;
709 nextchar = *++locinput;
710 break;
711 case DIGIT:
712 if (!isDIGIT(nextchar))
713 return 0;
714 nextchar = *++locinput;
715 break;
716 case NDIGIT:
717 if (!nextchar && locinput >= regeol)
718 return 0;
719 if (isDIGIT(nextchar))
720 return 0;
721 nextchar = *++locinput;
722 break;
723 case REF:
724 n = ARG1(scan); /* which paren pair */
725 s = regstartp[n];
726 if (!s)
727 return 0;
728 if (!regendp[n])
729 return 0;
730 if (s == regendp[n])
731 break;
732 /* Inline the first character, for speed. */
733 if (*s != nextchar)
734 return 0;
735 ln = regendp[n] - s;
736 if (locinput + ln > regeol)
737 return 0;
738 if (ln > 1 && bcmp(s, locinput, ln) != 0)
739 return 0;
740 locinput += ln;
741 nextchar = *locinput;
742 break;
743
744 case NOTHING:
745 break;
746 case BACK:
747 break;
748 case OPEN:
749 n = ARG1(scan); /* which paren pair */
750 regstartp[n] = locinput;
751 if (n > regsize)
752 regsize = n;
753 break;
754 case CLOSE:
755 n = ARG1(scan); /* which paren pair */
756 regendp[n] = locinput;
757 if (n > *reglastparen)
758 *reglastparen = n;
759 break;
760 case CURLYX: {
761 CURCUR cc;
762 CHECKPOINT cp = savestack_ix;
763 cc.oldcc = regcc;
764 regcc = &cc;
765 cc.parenfloor = *reglastparen;
766 cc.cur = -1;
767 cc.min = ARG1(scan);
768 cc.max = ARG2(scan);
769 cc.scan = NEXTOPER(scan) + 4;
770 cc.next = next;
771 cc.minmod = minmod;
772 cc.lastloc = 0;
773 reginput = locinput;
774 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
775 regcpblow(cp);
776 regcc = cc.oldcc;
777 return n;
778 }
779 /* NOT REACHED */
780 case WHILEM: {
781 /*
782 * This is really hard to understand, because after we match
783 * what we're trying to match, we must make sure the rest of
784 * the RE is going to match for sure, and to do that we have
785 * to go back UP the parse tree by recursing ever deeper. And
786 * if it fails, we have to reset our parent's current state
787 * that we can try again after backing off.
788 */
789
790 CURCUR* cc = regcc;
791 n = cc->cur + 1;
792 reginput = locinput;
793
794 /* If degenerate scan matches "", assume scan done. */
795
796 if (locinput == cc->lastloc) {
797 regcc = cc->oldcc;
798 ln = regcc->cur;
799 if (regmatch(cc->next))
800 return TRUE;
801 regcc->cur = ln;
802 regcc = cc;
803 return FALSE;
804 }
805
806 /* First just match a string of min scans. */
807
808 if (n < cc->min) {
809 cc->cur = n;
810 cc->lastloc = locinput;
811 return regmatch(cc->scan);
812 }
813
814 /* Prefer next over scan for minimal matching. */
815
816 if (cc->minmod) {
817 regcc = cc->oldcc;
818 ln = regcc->cur;
819 if (regmatch(cc->next))
820 return TRUE; /* All done. */
821 regcc->cur = ln;
822 regcc = cc;
823
824 if (n >= cc->max) /* Maximum greed exceeded? */
825 return FALSE;
a687059c 826
a0d0e21e
LW
827 /* Try scanning more and see if it helps. */
828 reginput = locinput;
829 cc->cur = n;
830 cc->lastloc = locinput;
831 return regmatch(cc->scan);
832 }
833
834 /* Prefer scan over next for maximal matching. */
835
836 if (n < cc->max) { /* More greed allowed? */
837 regcppush(cc->parenfloor);
838 cc->cur = n;
839 cc->lastloc = locinput;
840 if (regmatch(cc->scan))
841 return TRUE;
842 regcppop(); /* Restore some previous $<digit>s? */
843 reginput = locinput;
844 }
845
846 /* Failed deeper matches of scan, so see if this one works. */
847 regcc = cc->oldcc;
848 ln = regcc->cur;
849 if (regmatch(cc->next))
850 return TRUE;
851 regcc->cur = ln;
852 regcc = cc;
853 return FALSE;
854 }
855 /* NOT REACHED */
856 case BRANCH: {
857 if (OP(next) != BRANCH) /* No choice. */
858 next = NEXTOPER(scan);/* Avoid recursion. */
859 else {
748a9306 860 int lastparen = *reglastparen;
a0d0e21e
LW
861 do {
862 reginput = locinput;
863 if (regmatch(NEXTOPER(scan)))
864 return 1;
748a9306
LW
865 for (n = *reglastparen; n > lastparen; n--)
866 regendp[n] = 0;
867 *reglastparen = n;
868
a687059c 869#ifdef REGALIGN
a0d0e21e
LW
870 /*SUPPRESS 560*/
871 if (n = NEXT(scan))
872 scan += n;
873 else
874 scan = NULL;
a687059c 875#else
a0d0e21e 876 scan = regnext(scan);
79072805 877#endif
a0d0e21e
LW
878 } while (scan != NULL && OP(scan) == BRANCH);
879 return 0;
880 /* NOTREACHED */
a687059c 881 }
a0d0e21e
LW
882 }
883 break;
884 case MINMOD:
885 minmod = 1;
886 break;
887 case CURLY:
888 ln = ARG1(scan); /* min to match */
889 n = ARG2(scan); /* max to match */
890 scan = NEXTOPER(scan) + 4;
891 goto repeat;
892 case STAR:
893 ln = 0;
894 n = 32767;
895 scan = NEXTOPER(scan);
896 goto repeat;
897 case PLUS:
898 /*
899 * Lookahead to avoid useless match attempts
900 * when we know what character comes next.
901 */
902 ln = 1;
903 n = 32767;
904 scan = NEXTOPER(scan);
905 repeat:
906 if (OP(next) == EXACTLY)
907 nextchar = *(OPERAND(next)+1);
908 else
909 nextchar = -1000;
910 reginput = locinput;
911 if (minmod) {
912 minmod = 0;
913 if (ln && regrepeat(scan, ln) < ln)
914 return 0;
8e07c86e 915 while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */
a0d0e21e
LW
916 /* If it could work, try it. */
917 if (nextchar == -1000 || *reginput == nextchar)
918 if (regmatch(next))
919 return 1;
920 /* Couldn't or didn't -- back up. */
748a9306 921 reginput = locinput + ln;
a0d0e21e
LW
922 if (regrepeat(scan, 1)) {
923 ln++;
924 reginput = locinput + ln;
925 }
926 else
927 return 0;
928 }
929 }
930 else {
931 n = regrepeat(scan, n);
932 if (ln < n && regkind[(U8)OP(next)] == EOL &&
933 (!multiline || OP(next) == SEOL))
934 ln = n; /* why back off? */
935 while (n >= ln) {
936 /* If it could work, try it. */
937 if (nextchar == -1000 || *reginput == nextchar)
938 if (regmatch(next))
939 return 1;
940 /* Couldn't or didn't -- back up. */
941 n--;
942 reginput = locinput + n;
943 }
944 }
945 return 0;
946 case SUCCEED:
947 case END:
948 reginput = locinput; /* put where regtry can find it */
949 return 1; /* Success! */
950 case IFMATCH:
951 reginput = locinput;
952 scan = NEXTOPER(scan);
953 if (!regmatch(scan))
954 return 0;
955 break;
956 case UNLESSM:
957 reginput = locinput;
958 scan = NEXTOPER(scan);
959 if (regmatch(scan))
960 return 0;
961 break;
962 default:
963 fprintf(stderr, "%x %d\n",(unsigned)scan,scan[1]);
964 FAIL("regexp memory corruption");
a687059c 965 }
a0d0e21e
LW
966 scan = next;
967 }
a687059c 968
a0d0e21e
LW
969 /*
970 * We get here only if there's trouble -- normally "case END" is
971 * the terminating point.
972 */
973 FAIL("corrupted regexp pointers");
974 /*NOTREACHED*/
975 return 0;
a687059c
LW
976}
977
978/*
979 - regrepeat - repeatedly match something simple, report how many
980 */
981/*
982 * [This routine now assumes that it will only match on things of length 1.
983 * That was true before, but now we assume scan - reginput is the count,
984 * rather than incrementing count on every character.]
985 */
79072805 986static I32
00bf170e 987regrepeat(p, max)
a687059c 988char *p;
79072805 989I32 max;
a687059c 990{
a0d0e21e
LW
991 register char *scan;
992 register char *opnd;
993 register I32 c;
994 register char *loceol = regeol;
995
996 scan = reginput;
997 if (max != 32767 && max < loceol - scan)
998 loceol = scan + max;
999 opnd = OPERAND(p);
1000 switch (OP(p)) {
1001 case ANY:
1002 while (scan < loceol && *scan != '\n')
1003 scan++;
1004 break;
1005 case SANY:
1006 scan = loceol;
1007 break;
1008 case EXACTLY: /* length of string is 1 */
1009 opnd++;
1010 while (scan < loceol && *opnd == *scan)
1011 scan++;
1012 break;
1013 case ANYOF:
1014 c = UCHARAT(scan);
1015 while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) {
1016 scan++;
1017 c = UCHARAT(scan);
a687059c 1018 }
a0d0e21e
LW
1019 break;
1020 case ALNUM:
1021 while (scan < loceol && isALNUM(*scan))
1022 scan++;
1023 break;
1024 case NALNUM:
1025 while (scan < loceol && !isALNUM(*scan))
1026 scan++;
1027 break;
1028 case SPACE:
1029 while (scan < loceol && isSPACE(*scan))
1030 scan++;
1031 break;
1032 case NSPACE:
1033 while (scan < loceol && !isSPACE(*scan))
1034 scan++;
1035 break;
1036 case DIGIT:
1037 while (scan < loceol && isDIGIT(*scan))
1038 scan++;
1039 break;
1040 case NDIGIT:
1041 while (scan < loceol && !isDIGIT(*scan))
1042 scan++;
1043 break;
1044 default: /* Called on something of 0 width. */
1045 break; /* So match right here or not at all. */
1046 }
a687059c 1047
a0d0e21e
LW
1048 c = scan - reginput;
1049 reginput = scan;
a687059c 1050
a0d0e21e 1051 return(c);
a687059c
LW
1052}
1053
1054/*
1055 - regnext - dig the "next" pointer out of a node
1056 *
1057 * [Note, when REGALIGN is defined there are two places in regmatch()
1058 * that bypass this code for speed.]
1059 */
1060char *
1061regnext(p)
1062register char *p;
1063{
a0d0e21e 1064 register I32 offset;
a687059c 1065
a0d0e21e
LW
1066 if (p == &regdummy)
1067 return(NULL);
a687059c 1068
a0d0e21e
LW
1069 offset = NEXT(p);
1070 if (offset == 0)
1071 return(NULL);
a687059c
LW
1072
1073#ifdef REGALIGN
a0d0e21e 1074 return(p+offset);
a687059c 1075#else
a0d0e21e
LW
1076 if (OP(p) == BACK)
1077 return(p-offset);
1078 else
1079 return(p+offset);
a687059c
LW
1080#endif
1081}